Split a data frame into overlapping dataframes - r

I'm trying to write a function that behaves as follows, but it is proving very difficult:
DF <- data.frame(x = seq(1,10), y = rep(c('a','b','c','d','e'),2))
> DF
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
>OverLapSplit(DF,nsplits=2,overlap=2)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
[[2]]
x y
1 5 a
2 6 b
3 7 c
4 8 d
5 9 e
6 10 a
>OverLapSplit(DF,nsplits=1)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
>OverLapSplit(DF,nsplits=2,overlap=4)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
7 7 b
[[2]]
x y
1 4 e
2 5 a
3 6 b
4 7 c
5 8 d
6 9 e
7 10 a
>OverLapSplit(DF,nsplits=5,overlap=1)
[[1]]
x y
1 1 a
2 2 b
3 3 c
[[2]]
x y
1 3 c
2 4 d
3 5 e
[[3]]
x y
1 5 e
2 6 a
3 7 b
[[4]]
x y
1 7 b
2 8 c
3 9 d
[[5]]
x y
1 8 d
2 9 e
3 10 f
I haven't thought a lot about what would happen if you tried something like OverLapSplit(DF,nsplits=2,overlap=1)
Maybe the following:
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
[[2]]
x y
1 5 a
2 6 b
3 7 c
4 8 d
5 9 e
6 10 a
Thanks!

Try something like :
OverlapSplit <- function(x,nsplit=1,overlap=2){
nrows <- NROW(x)
nperdf <- ceiling( (nrows + overlap*nsplit) / (nsplit+1) )
start <- seq(1, nsplit*(nperdf-overlap)+1, by= nperdf-overlap )
if( start[nsplit+1] + nperdf != nrows )
warning("Returning an incomplete dataframe.")
lapply(start, function(i) x[c(i:(i+nperdf-1)),])
}
with nsplit the number of splits! (nsplit=1 returns 2 dataframes). This will render an incomplete last dataframe in case the overlap splits don't really fit in the dataframe, and issues a warning.
> OverlapSplit(DF,nsplit=3,overlap=2)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
[[2]]
x y
3 3 c
4 4 d
5 5 e
6 6 a
[[3]]
x y
5 5 e
6 6 a
7 7 b
8 8 c
[[4]]
x y
7 7 b
8 8 c
9 9 d
10 10 e
And one with a warning
> OverlapSplit(DF,nsplit=1,overlap=1)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
[[2]]
x y
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
NA NA <NA>
Warning message:
In OverlapSplit(DF, nsplit = 1, overlap = 1) :
Returning an incomplete dataframe.

This uses the shingle idea from Lattice graphics and so leverages code from package lattice to generate the intervals and then uses a loop to break the original DF into the correct subsets.
I wasn't exactly sure what is meant by overlap = 1 - I presume you meant overlap by 1 sample/observation. If so, the code below does this.
OverlapSplit <- function(x, nsplits = 1, overlap = 0) {
stopifnot(require(lattice))
N <- seq_len(nr <- nrow(x))
interv <- co.intervals(N, nsplits, overlap / nr)
out <- vector(mode = "list", length = nrow(interv))
for(i in seq_along(out)) {
out[[i]] <- x[interv[i,1] < N & N < interv[i,2], , drop = FALSE]
}
out
}
Which gives:
> OverlapSplit(DF, 2, 2)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
[[2]]
x y
5 5 e
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
> OverlapSplit(DF)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
> OverlapSplit(DF, 4, 1)
[[1]]
x y
1 1 a
2 2 b
3 3 c
[[2]]
x y
3 3 c
4 4 d
5 5 e
[[3]]
x y
6 6 a
7 7 b
8 8 c
[[4]]
x y
8 8 c
9 9 d
10 10 e

Just to make it clear what I'm doing here:
#Load Libraries
library(PerformanceAnalytics)
library(quantmod)
#Function to Split Data Frame
OverlapSplit <- function(x,nsplit=1,overlap=0){
nrows <- NROW(x)
nperdf <- ceiling( (nrows + overlap*nsplit) / (nsplit+1) )
start <- seq(1, nsplit*(nperdf-overlap)+1, by= nperdf-overlap )
if( start[nsplit+1] + nperdf != nrows )
warning("Returning an incomplete dataframe.")
lapply(start, function(i) x[c(i:(i+nperdf-1)),])
}
#Function to run regression on 30 days to predict the next day
FL <- as.formula(Next(HAM1)~HAM1+HAM2+HAM3+HAM4)
MyRegression <- function(df,FL) {
df <- as.data.frame(df)
model <- lm(FL,data=df[1:30,])
predict(model,newdata=df[31,])
}
#Function to roll the regression
RollMyRegression <- function(data,ModelFUN,FL) {
rollapply(data, width=31,FUN=ModelFUN,FL,
by.column = FALSE, align = "right", na.pad = FALSE)
}
#Load Data
data(managers)
#Split Dataset
split.data <- OverlapSplit(managers,2,30)
sapply(split.data,dim)
#Run rolling regression on each split
output <- lapply(split.data,RollMyRegression,MyRegression,FL)
output
unlist(output)
In this manner, you can replace lapply at the end with a parallel version of lapply and increase your speed somewhat.
Of course, now there's the issue of optimizing the split/overlap, given you number of processors and the size of your dataset.

Related

Transforming columns every nth row to multiple rows

I have a data frame that I successfully separated every nth row of a column and arranged the snippets into rows:
list = data.frame(x = c("A", "1", "2", "3", "B", "1", "2", "3"))
x
1 A
2 1
3 2
4 3
5 B
6 1
7 2
8 3
to
w x y z
1 A 1 2 3
2 B 1 2 3
I managed to achieve this with:
table <- data.frame(matrix(list$x, ncol = 4, byrow = TRUE))
In the next step I would like to do this with multiple columns and place the resulting tables beneath each other.
From something like this:
x y z
1 A D G
2 3 5 6
3 3 7 4
4 6 7 5
5 B E H
6 4 5 2
7 3 3 4
8 7 7 6
9 C F I
10 7 4 3
11 3 6 9
12 6 8 0
to
w x y z
1 A 3 3 6
2 B 4 3 7
3 C 7 3 6
4 D 5 7 6
5 E 5 3 7
6 F 4 6 8
7 G 6 4 7
8 H 2 4 6
9 I 3 9 0
I am really stuck with this one.
If someone has an idea I would highly appreciate some help.
Making use of lapply and dplyr::bind_rows() this could be achieved like so:
library(dplyr)
df_list <- lapply(list, function(x) data.frame(matrix(x, ncol = 4, byrow = TRUE)))
df_list %>%
dplyr::bind_rows() %>%
setNames(c("w", "x", "y", "z"))
#> w x y z
#> 1 A 3 3 6
#> 2 B 4 3 7
#> 3 C 7 3 6
#> 4 D 5 7 7
#> 5 E 5 3 7
#> 6 F 4 6 8
#> 7 G 6 4 5
#> 8 H 2 4 6
#> 9 I 3 9 0
Or using do.call and rbind:
df_list <- do.call(rbind, c(df_list, list(make.row.names = FALSE)))
setNames(df_list, c("w", "x", "y", "z"))
#> w x y z
#> 1 A 3 3 6
#> 2 B 4 3 7
#> 3 C 7 3 6
#> 4 D 5 7 7
#> 5 E 5 3 7
#> 6 F 4 6 8
#> 7 G 6 4 5
#> 8 H 2 4 6
#> 9 I 3 9 0
DATA
list <- read.table(text = " x y z
1 A D G
2 3 5 6
3 3 7 4
4 6 7 5
5 B E H
6 4 5 2
7 3 3 4
8 7 7 6
9 C F I
10 7 4 3
11 3 6 9
12 6 8 0", header = TRUE)
Here is another solution. You could use good old base::by function in order to split a data set into groups and apply a function on each chunk. (Here is to dear #Henrik who taught me this valuable trick):
do.call(rbind, by(df, rep(seq_len(nrow(df)/4), each = 4), FUN = \(x) {
{setNames(as.data.frame(t(x[-4])), c("w", "x", "y", "z")) |>
`rownames<-`(NULL)}
})) |> `rownames<-`(NULL)
w x y z
1 A 3 3 6
2 D 5 7 7
3 G 6 4 5
4 B 4 3 7
5 E 5 3 7
6 H 2 4 6
7 C 7 3 6
8 F 4 6 8
9 I 3 9 0
Because the number of rows in each chunk seems to be known and constant, you may unlist the data frame, and use modulo (%%) to distinguish between the characters belonging to the "sub headers" and the numeric values.
v = unlist(d)
i = (1:nrow(d) - 1) %% 4 == 0
data.frame(w = v[i],
matrix(v[!i], ncol = 3, byrow = TRUE, dimnames = list(NULL, names(d))))
w x y z
x1 A 3 3 6
x5 B 4 3 7
x9 C 7 3 6
y1 D 5 7 7
y5 E 5 3 7
y9 F 4 6 8
z1 G 6 4 5
z5 H 2 4 6
z9 I 3 9 0
Wrap v[i] in as.integer or as.numeric if that's the desired class.
Add row.names = NULL to the data.frame call if you happen to care about row names.
Base R Split-Apply-Combine, a bit-ugly but is generic enough to handle data having more or less than 4 rows per matrix:
res <- data.frame(
do.call(
rbind,
lapply(
with(
lst,
split(
lst,
cumsum(
apply(
lst,
1,
function(row){
all(
grepl(
"[a-zA-Z]",
row
)
)
}
)
)
)
),
function(x){
type.convert(
setNames(
data.frame(
t(x),
stringsAsFactors = FALSE
),
letters[23:26]
)
)
}
)
),
row.names = NULL
)

How to add value into new column based on corresponding value in another column?

This is the sample data with 'y' being the new variable created.
x
A
B
C
y
A
1
4
7
B
5
6
7
C
3
5
3
If the value of column x ="A", I would like the value of col.A to be displayed in column y. And similarly for the "B" & "C" values in column x.
Final result should be something like this.
x
A
B
C
y
A
1
4
7
1
B
5
6
7
6
C
3
5
3
3
A proposition :
df <- read.table(header=TRUE, text="
x A B C
A 1 4 7
B 5 6 7
C 3 5 3
"
)
df$y <- paste0("df$",df$x,"[df$x=='",df$x,"']")
df
#> x A B C y
#> 1 A 1 4 7 df$A[df$x=='A']
#> 2 B 5 6 7 df$B[df$x=='B']
#> 3 C 3 5 3 df$C[df$x=='C']
df$y <- eval(ivmte:::unstring(df$y))
df
#> x A B C y
#> 1 A 1 4 7 1
#> 2 B 5 6 7 6
#> 3 C 3 5 3 3
# Created on 2021-01-30 by the reprex package (v0.3.0.9001)
Regards,
Try this:
create_column<-function(){
y<-numeric(nrow(your_dataframe))
for (i in 1:nrow(your_dataframe)){
y[i]<-your_dataframe[i, which(names(your_dataframe)==your_dataframe$x[i])]
}
cbind(your_dataframe, y)
}
create_column()
x A B C y
1 A 1 4 7 1
2 B 5 6 7 6
3 C 3 5 3 3
>
another option with apply:
cbind(your_dataframe, y=apply(your_dataframe, 1, function(x){
x[which(names(x)==x['x'])]
}))
> your_dataframe
x A B C y
1 A 1 4 7 1
2 B 5 6 7 6
3 C 3 5 3 3
Try this
df$y <- df[-1][cbind(seq(nrow(df)),match(df$x,names(df)[-1]))]

Look up value based on values in a series of two other columns

I want to look up a value in column VAR_1, based on the values of 'ID_1' and `ID_2'.
set.seed(1234)
df <- data.frame(ID_1 = sample(1:10, 10),
ID_2 = sample(1:10, 10),
VAR_1 = sample(LETTERS[1:10], 10))
df
ID_1 ID_2 VAR_1
1 2 7 D
2 6 5 C
3 5 3 B
4 8 10 A
5 9 2 H
6 4 9 E
7 1 6 I
8 7 1 J
9 10 8 F
10 3 4 G
Look at the number in ID_2, find the matching number in ID_1 and record VAR_1 from that row in a new column RES
What the result should look like:
ID_1 ID_2 VAR_1 RES
1 2 7 D J
2 6 5 C B
3 5 3 B G
4 8 10 A F
5 9 2 H D
6 4 9 E H
7 1 6 I C
8 7 1 J I
9 10 8 F A
10 3 4 G E
I have tried using which, a for-loop and if_else statements all to no avail, I'm sure it must have a simple solution. Any help much appreciated.
You need ?match -
df$RES <- with(df, VAR_1[match(ID_2, ID_1)])
ID_1 ID_2 VAR_1 RES
1 2 7 D J
2 6 5 C B
3 5 3 B G
4 8 10 A F
5 9 2 H D
6 4 9 E H
7 1 6 I C
8 7 1 J I
9 10 8 F A
10 3 4 G E

how to delete dataframe's row with 3 of 5 equal element?

I have a dataframe with 5 columns and many many rows, that have repetition of elements only for the first 3 columns (in short, it is a volume built by several volumes, and so there are same coordinates (x,y,z) with different labels, and I would like to eliminate the repeated coordinates).
How can I eliminate these with R commands?
Thanks
AV
You can use duplicated function, e.g. :
# create an example data.frame
Lab1<-letters[1:10]
Lab2<-LETTERS[1:10]
x <- c(3,4,3,3,4,2,4,3,9,0)
y <- c(3,4,3,5,4,2,1,5,7,2)
z <- c(8,7,8,8,4,3,1,8,6,3)
DF <- data.frame(Lab1,Lab2,x,y,z)
> DF
Lab1 Lab2 x y z
1 a A 3 3 8
2 b B 4 4 7
3 c C 3 3 8
4 d D 3 5 8
5 e E 4 4 4
6 f F 2 2 3
7 g G 4 1 1
8 h H 3 5 8
9 i I 9 7 6
10 j J 0 2 3
# remove rows having repeated x,y,z
DF2 <- DF[!duplicated(DF[,c('x','y','z')]),]
> DF2
Lab1 Lab2 x y z
1 a A 3 3 8
2 b B 4 4 7
4 d D 3 5 8
5 e E 4 4 4
6 f F 2 2 3
7 g G 4 1 1
9 i I 9 7 6
10 j J 0 2 3
EDIT :
To allow choosing amongst the rows having the same coordinates, you can use for example by function (even if is less efficient then previous approach) :
res <- by(DF,
INDICES=paste(DF$x,DF$y,DF$z,sep='|'),
FUN=function(equalRows){
# equalRows is a data.frame with the rows having the same x,y,z
# for exampel here we choose the first row ordering by Lab1 then Lab2
row <- equalRows[order(equalRows$Lab1,equalRows$Lab2),][1,]
return(row)
})
DF2 <- do.call(rbind.data.frame,res)
> DF2
Lab1 Lab2 x y z
0|2|3 j J 0 2 3
2|2|3 f F 2 2 3
3|3|8 a A 3 3 8
3|5|8 d D 3 5 8
4|1|1 g G 4 1 1
4|4|4 e E 4 4 4
4|4|7 b B 4 4 7
9|7|6 i I 9 7 6

How to merge two datasets by the different values in R?

I have two datasets and want to merge them. How I add to first dataset only the lines that are in the second that are not in the first?
Only add to final dataset if the value not exists in the another dataset. An example dataset:
x = data.frame(id = c("a","c","d","g"),
value = c(1,3,4,7))
y = data.frame(id = c("b","c","d","e","f"),
value = c(5,6,8,9,7))
The merged dataset should look like (the order is not important):
a 1
b 5
c 3
d 4
e 9
f 7
g 7
Using !, %in% and rbind:
rbind(x[!x$id %in% y$id,], y)
id value
1 a 1
4 g 7
3 b 2
41 c 3
5 d 4
6 e 5
7 f 6
For your example to work, you first need to ensure that id in each data.frame are directly comparable. Since they're factors, you need ensure they have the same levels/labels; or you can just convert them to character.
# convert factors to character
x$id <- as.character(x$id)
y$id <- as.character(y$id)
# merge
z <- merge(x,y,by="id",all=TRUE)
# keep first value, if it exists
z$value <- ifelse(is.na(z$value.x),z$value.y,z$value.x)
# keep desired columns
z <- z[,c("id","value")]
z
# id value
# 1 a 1
# 2 b 5
# 3 c 3
# 4 d 4
# 5 e 9
# 6 f 7
# 7 g 7
You already answered your own question, but just didn't realize it right away. :)
> merge(x,y,all=TRUE)
id value
1 a 1
2 c 3
3 c 6
4 d 4
5 d 8
6 g 7
7 b 5
8 e 9
9 f 7
EDIT
I'm a bit dense here and I'm not sure where you're getting at, so I provide you with a shotgun approach. What I did was I merged the data.frames by id and copied values from x to y if y` was missing. Take whichever column you need.
> x = data.frame(id = c("a","c","d","g"),
+ value = c(1,3,4,7))
> y = data.frame(id = c("b","c","d","e","f"),
+ value = c(5,6,8,9,7))
> xy <- merge(x, y, by = "id", all = TRUE)
> xy
id value.x value.y
1 a 1 NA
2 c 3 6
3 d 4 8
4 g 7 NA
5 b NA 5
6 e NA 9
7 f NA 7
> find.na <- is.na(xy[, "value.y"])
> xy$new.col <- xy[, "value.y"]
> xy[find.na, "new.col"] <- xy[find.na, "value.x"]
> xy
id value.x value.y new.col
1 a 1 NA 1
2 c 3 6 6
3 d 4 8 8
4 g 7 NA 7
5 b NA 5 5
6 e NA 9 9
7 f NA 7 7
> xy[order(as.character(xy$id)), ]
id value.x value.y new.col
1 a 1 NA 1
5 b NA 5 5
2 c 3 6 6
3 d 4 8 8
6 e NA 9 9
7 f NA 7 7
4 g 7 NA 7

Resources