Related
I have a data frame that I successfully separated every nth row of a column and arranged the snippets into rows:
list = data.frame(x = c("A", "1", "2", "3", "B", "1", "2", "3"))
x
1 A
2 1
3 2
4 3
5 B
6 1
7 2
8 3
to
w x y z
1 A 1 2 3
2 B 1 2 3
I managed to achieve this with:
table <- data.frame(matrix(list$x, ncol = 4, byrow = TRUE))
In the next step I would like to do this with multiple columns and place the resulting tables beneath each other.
From something like this:
x y z
1 A D G
2 3 5 6
3 3 7 4
4 6 7 5
5 B E H
6 4 5 2
7 3 3 4
8 7 7 6
9 C F I
10 7 4 3
11 3 6 9
12 6 8 0
to
w x y z
1 A 3 3 6
2 B 4 3 7
3 C 7 3 6
4 D 5 7 6
5 E 5 3 7
6 F 4 6 8
7 G 6 4 7
8 H 2 4 6
9 I 3 9 0
I am really stuck with this one.
If someone has an idea I would highly appreciate some help.
Making use of lapply and dplyr::bind_rows() this could be achieved like so:
library(dplyr)
df_list <- lapply(list, function(x) data.frame(matrix(x, ncol = 4, byrow = TRUE)))
df_list %>%
dplyr::bind_rows() %>%
setNames(c("w", "x", "y", "z"))
#> w x y z
#> 1 A 3 3 6
#> 2 B 4 3 7
#> 3 C 7 3 6
#> 4 D 5 7 7
#> 5 E 5 3 7
#> 6 F 4 6 8
#> 7 G 6 4 5
#> 8 H 2 4 6
#> 9 I 3 9 0
Or using do.call and rbind:
df_list <- do.call(rbind, c(df_list, list(make.row.names = FALSE)))
setNames(df_list, c("w", "x", "y", "z"))
#> w x y z
#> 1 A 3 3 6
#> 2 B 4 3 7
#> 3 C 7 3 6
#> 4 D 5 7 7
#> 5 E 5 3 7
#> 6 F 4 6 8
#> 7 G 6 4 5
#> 8 H 2 4 6
#> 9 I 3 9 0
DATA
list <- read.table(text = " x y z
1 A D G
2 3 5 6
3 3 7 4
4 6 7 5
5 B E H
6 4 5 2
7 3 3 4
8 7 7 6
9 C F I
10 7 4 3
11 3 6 9
12 6 8 0", header = TRUE)
Here is another solution. You could use good old base::by function in order to split a data set into groups and apply a function on each chunk. (Here is to dear #Henrik who taught me this valuable trick):
do.call(rbind, by(df, rep(seq_len(nrow(df)/4), each = 4), FUN = \(x) {
{setNames(as.data.frame(t(x[-4])), c("w", "x", "y", "z")) |>
`rownames<-`(NULL)}
})) |> `rownames<-`(NULL)
w x y z
1 A 3 3 6
2 D 5 7 7
3 G 6 4 5
4 B 4 3 7
5 E 5 3 7
6 H 2 4 6
7 C 7 3 6
8 F 4 6 8
9 I 3 9 0
Because the number of rows in each chunk seems to be known and constant, you may unlist the data frame, and use modulo (%%) to distinguish between the characters belonging to the "sub headers" and the numeric values.
v = unlist(d)
i = (1:nrow(d) - 1) %% 4 == 0
data.frame(w = v[i],
matrix(v[!i], ncol = 3, byrow = TRUE, dimnames = list(NULL, names(d))))
w x y z
x1 A 3 3 6
x5 B 4 3 7
x9 C 7 3 6
y1 D 5 7 7
y5 E 5 3 7
y9 F 4 6 8
z1 G 6 4 5
z5 H 2 4 6
z9 I 3 9 0
Wrap v[i] in as.integer or as.numeric if that's the desired class.
Add row.names = NULL to the data.frame call if you happen to care about row names.
Base R Split-Apply-Combine, a bit-ugly but is generic enough to handle data having more or less than 4 rows per matrix:
res <- data.frame(
do.call(
rbind,
lapply(
with(
lst,
split(
lst,
cumsum(
apply(
lst,
1,
function(row){
all(
grepl(
"[a-zA-Z]",
row
)
)
}
)
)
)
),
function(x){
type.convert(
setNames(
data.frame(
t(x),
stringsAsFactors = FALSE
),
letters[23:26]
)
)
}
)
),
row.names = NULL
)
SO. The following might serve as a small example of the real list.
a <- data.frame(
x = c("A","A","A","A","A"),
y = c(1,2,3,4,5),
z = c(1,2,3,4,5))
b <- data.frame(
x = c("A","A","A","A","A"),
y = c(1,2,3,4,5),
z = c(1,2,3,4,5))
c <- data.frame(
x = c("A","A","A","A","A"),
y = c(1,2,3,4,5),
z = c(1,2,3,4,5))
l <- list(a,b,c)
From the second column to last column - on every data frame - i want to add the sums as a new column to each data frame.
I tried:
lapply(l, function(x) rowSums(x[2:ncol(x)]))
which returns the correct sums, but doesn't add them to the data frames.
I also tried:
lapply(l, transform, sum = y + z)
which gives me the correct results but is not flexible enough, because i don't always know how many columns there are for each data frame and what names they have. The only thing i know, is, that i have to start from second column to end. I tried to combine these two approaches but i can't figure out, how to do it exactly.
Thanks
Try this. You can play around index in columns and exclude the first variable so that there is not issues about how many additional variables you have in order to obtain the rowsums. Here the code:
#Compute rowsums
l1 <- lapply(l,function(x) {x$RowSum<-rowSums(x[,-1],na.rm=T);return(x)})
Output:
l1
[[1]]
x y z RowSum
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
[[2]]
x y z RowSum
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
[[3]]
x y z RowSum
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
Here's how to combine your attempts. I used data[-1] instead of data[2:ncol(data)] because it seems simpler, but either should work.
lapply(l, function(data) transform(data, sum = rowSums(data[-1])))
Unfortunately, transform will be confused if the name of the argument to your anonymous function is the same as a column name - data[-1] needs to look at the data frame, not a particular column. (I originally use function(x) instead of function(data), and this caused an error because there is a column named x. From this perspective, Duck's answer is a little safer.)
Does this work:
> add_col <- function(df){
+ df[(ncol(df)+1)] = rowSums(df[2:ncol(df)])
+ df
+ }
> lapply(l, add_col)
[[1]]
x y z V4
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
[[2]]
x y z V4
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
[[3]]
x y z V4
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
>
With sum as column name:
> add_col <- function(df){
+ df['sum'] = rowSums(df[2:ncol(df)])
+ df
+ }
> lapply(l, add_col)
[[1]]
x y z sum
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
[[2]]
x y z sum
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
[[3]]
x y z sum
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
use tidyverse
library(tidyverse)
map(l, ~.x %>% mutate(Sum := apply(.x[-1], 1, sum)))
#> [[1]]
#> x y z Sum
#> 1 A 1 1 2
#> 2 A 2 2 4
#> 3 A 3 3 6
#> 4 A 4 4 8
#> 5 A 5 5 10
#>
#> [[2]]
#> x y z Sum
#> 1 A 1 1 2
#> 2 A 2 2 4
#> 3 A 3 3 6
#> 4 A 4 4 8
#> 5 A 5 5 10
#>
#> [[3]]
#> x y z Sum
#> 1 A 1 1 2
#> 2 A 2 2 4
#> 3 A 3 3 6
#> 4 A 4 4 8
#> 5 A 5 5 10
Created on 2020-09-30 by the reprex package (v0.3.0)
We can use map with mutate
library(purrr)
library(dplyr)
map(l, ~ .x %>%
mutate(sum = rowSums(select(., -1))))
Or with c_across
map(l, ~ .x %>%
rowwise() %>%
mutate(sum = sum(c_across(-1), na.rm = TRUE)) %>%
ungroup)
Assuming we have the following simplified vectors (in reality, they contain much more values):
n <- c(1,2)
x <- c(4,5,6)
y <- c(7,8,9)
#to get all possible combinations, we can use expand.grid
df <- expand.grid(n=n,
x=x,
y=y
)
> df
n x y
1 4 7
2 4 7
1 5 7
2 5 7
1 6 7
2 6 7
1 4 8
2 4 8
1 5 8
2 5 8
1 6 8
2 6 8
1 4 9
2 4 9
1 5 9
2 5 9
1 6 9
2 6 9
However, I would like vectors x, y to have the combination where only elements with the same index values are considered, i.e. (x1, y1), (x2, y2), (x3, y3) but NOT (x1,y2), (x1,y3), etc.
while vector n is still used as usual (all its elements are 'paired' with the outcome of x and y combination).
In other words, I would like to get the following df:
n x y
1 4 7
2 4 7
1 5 8
2 5 8
1 6 9
2 6 9
if n vector had 3 elements, i.e. n <- (1, 2, 3), then we would have:
n x y
1 4 7
2 4 7
3 4 7
1 5 8
2 5 8
3 5 8
1 6 9
2 6 9
3 6 9
You could combine list of pairs that need to be together and then use it in expand.grid
expand.grid(n, Map(c, x, y)) %>% tidyr::unnest_wider(Var2)
Or we can also use crossing using the same logic.
library(tidyverse)
crossing(n, x = map2(x, y, c)) %>%
unnest_wider(x) %>%
rename_at(-1, ~c("x", "y"))
# n x y
# <dbl> <dbl> <dbl>
#1 1 4 7
#2 1 5 8
#3 1 6 9
#4 2 4 7
#5 2 5 8
#6 2 6 9
We can create a function to do this
f1 <- function(vec1, vec2, n) {
d1 <- data.frame(x = vec1, y = vec2)
d2 <- transform(d1[rep(seq_len(nrow(d1)), each = length(n)), ], n = n)
row.names(d2) <- NULL
d2[c('n', 'x', 'y')]
}
f1(x, y, n = 1:2)
# n x y
#1 1 4 7
#2 2 4 7
#3 1 5 8
#4 2 5 8
#5 1 6 9
#6 2 6 9
f1(x, y, n = 1:3)
# n x y
#1 1 4 7
#2 2 4 7
#3 3 4 7
#4 1 5 8
#5 2 5 8
#6 3 5 8
#7 1 6 9
#8 2 6 9
#9 3 6 9
Or in tidyverse
library(dplyr)
library(tidyr)
tibble(x, y) %>%
uncount(length(n)) %>%
mutate(n = rep(n, length.out = n())) %>%
select(n, x, y)
# A tibble: 9 x 3
# n x y
# <int> <dbl> <dbl>
#1 1 4 7
#2 2 4 7
#3 3 4 7
#4 1 5 8
#5 2 5 8
#6 3 5 8
#7 1 6 9
#8 2 6 9
#9 3 6 9
Or create a tibble first and then use that with crossing
tibble(x, y) %>%
crossing(n)
data
n <- 1:3
Here's a tidyverse solution, using purrr::map_df:
library(tidyverse)
map_df(n, ~tibble(n=.x, x, y))
n x y
<dbl> <dbl> <dbl>
1 1 4 7
2 1 5 8
3 1 6 9
4 2 4 7
5 2 5 8
6 2 6 9
If you need the values sorted exactly like your example output, add %>% arrange(x, y) to the output of map.
One option would be to paste together x and y, then use expand grid and separate the columns using the separate function from the tidyr package.
library(dplyr) #for pipe
library(tidyr) #for separate
n <- c(1,2)
x <- c(4,5,6)
y <- c(7,8,9)
z <- paste(x, y, sep = "-")
expand.grid(n = n, xy = z) %>%
separate(xy, sep = "-", into = c("x", "y")) %>%
mutate(x = as.numeric(x), y = as.numeric(y)) %>%
as.tibble()
I have a list containing two matrices:
a <- list("m1"=matrix(1:9, nrow = 3, ncol = 3),
"m2"=matrix(1:9, nrow = 3, ncol = 3))
I want to bind the two matrices (row-bind), and to distinguish the rows, I want to add a column that contains the name of the matrix. I can bind the rows using r bind:
b <- do.call(rbind, a) %>% as.data.frame
which yields
V1 V2 V3
1 1 4 7
2 2 5 8
3 3 6 9
4 1 4 7
5 2 5 8
6 3 6 9
But how do I add a column containing the names? I can do b$id <- c("m1","m1","m1","m2","m2","m2"), but there must be an easier way than this (?)
Here's how to do it in dplyr / purrr
a %>% purrr::map(as.data.frame) %>% dplyr::bind_rows(.id = "origin")
origin V1 V2 V3
1 m1 1 4 7
2 m1 2 5 8
3 m1 3 6 9
4 m2 1 4 7
5 m2 2 5 8
6 m2 3 6 9
That converts the matrices to data-frames before row-binding them.
You can use bind_rows on a list of matrices. But it doesn't return what you expect.
a %>% bind_rows(.id = "origin")
# A tibble: 9 x 3
origin m1 m2
<chr> <int> <int>
1 1 1 1
2 1 2 2
3 1 3 3
4 1 4 4
5 1 5 5
6 1 6 6
7 1 7 7
8 1 8 8
9 1 9 9
This happens because m1 and m2 are vectors (because they are matrices) of the same length, and bind_rows sees a list of constant-length vectors as a single data-frame. So the latter call is equivalent to
bind_rows(data.frame(m1 = as.vector(m1), m2 = as.vector(m2)), .id = "origin")
So, make sure you convert your matrices to data.frames before you bind them together.
You can do:
b <- do.call(rbind.data.frame, a)
# V1 V2 V3
#m1.1 1 4 7
#m1.2 2 5 8
#m1.3 3 6 9
#m2.1 1 4 7
#m2.2 2 5 8
#m2.3 3 6 9
or if you not happy with this,
b <- do.call(rbind.data.frame, a)
b$id <- sub("[.].+", "", rownames(b))
# V1 V2 V3 id
#m1.1 1 4 7 m1
#m1.2 2 5 8 m1
#m1.3 3 6 9 m1
#m2.1 1 4 7 m2
#m2.2 2 5 8 m2
#m2.3 3 6 9 m2
I'm trying to write a function that behaves as follows, but it is proving very difficult:
DF <- data.frame(x = seq(1,10), y = rep(c('a','b','c','d','e'),2))
> DF
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
>OverLapSplit(DF,nsplits=2,overlap=2)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
[[2]]
x y
1 5 a
2 6 b
3 7 c
4 8 d
5 9 e
6 10 a
>OverLapSplit(DF,nsplits=1)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
>OverLapSplit(DF,nsplits=2,overlap=4)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
7 7 b
[[2]]
x y
1 4 e
2 5 a
3 6 b
4 7 c
5 8 d
6 9 e
7 10 a
>OverLapSplit(DF,nsplits=5,overlap=1)
[[1]]
x y
1 1 a
2 2 b
3 3 c
[[2]]
x y
1 3 c
2 4 d
3 5 e
[[3]]
x y
1 5 e
2 6 a
3 7 b
[[4]]
x y
1 7 b
2 8 c
3 9 d
[[5]]
x y
1 8 d
2 9 e
3 10 f
I haven't thought a lot about what would happen if you tried something like OverLapSplit(DF,nsplits=2,overlap=1)
Maybe the following:
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
[[2]]
x y
1 5 a
2 6 b
3 7 c
4 8 d
5 9 e
6 10 a
Thanks!
Try something like :
OverlapSplit <- function(x,nsplit=1,overlap=2){
nrows <- NROW(x)
nperdf <- ceiling( (nrows + overlap*nsplit) / (nsplit+1) )
start <- seq(1, nsplit*(nperdf-overlap)+1, by= nperdf-overlap )
if( start[nsplit+1] + nperdf != nrows )
warning("Returning an incomplete dataframe.")
lapply(start, function(i) x[c(i:(i+nperdf-1)),])
}
with nsplit the number of splits! (nsplit=1 returns 2 dataframes). This will render an incomplete last dataframe in case the overlap splits don't really fit in the dataframe, and issues a warning.
> OverlapSplit(DF,nsplit=3,overlap=2)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
[[2]]
x y
3 3 c
4 4 d
5 5 e
6 6 a
[[3]]
x y
5 5 e
6 6 a
7 7 b
8 8 c
[[4]]
x y
7 7 b
8 8 c
9 9 d
10 10 e
And one with a warning
> OverlapSplit(DF,nsplit=1,overlap=1)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
[[2]]
x y
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
NA NA <NA>
Warning message:
In OverlapSplit(DF, nsplit = 1, overlap = 1) :
Returning an incomplete dataframe.
This uses the shingle idea from Lattice graphics and so leverages code from package lattice to generate the intervals and then uses a loop to break the original DF into the correct subsets.
I wasn't exactly sure what is meant by overlap = 1 - I presume you meant overlap by 1 sample/observation. If so, the code below does this.
OverlapSplit <- function(x, nsplits = 1, overlap = 0) {
stopifnot(require(lattice))
N <- seq_len(nr <- nrow(x))
interv <- co.intervals(N, nsplits, overlap / nr)
out <- vector(mode = "list", length = nrow(interv))
for(i in seq_along(out)) {
out[[i]] <- x[interv[i,1] < N & N < interv[i,2], , drop = FALSE]
}
out
}
Which gives:
> OverlapSplit(DF, 2, 2)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
[[2]]
x y
5 5 e
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
> OverlapSplit(DF)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
> OverlapSplit(DF, 4, 1)
[[1]]
x y
1 1 a
2 2 b
3 3 c
[[2]]
x y
3 3 c
4 4 d
5 5 e
[[3]]
x y
6 6 a
7 7 b
8 8 c
[[4]]
x y
8 8 c
9 9 d
10 10 e
Just to make it clear what I'm doing here:
#Load Libraries
library(PerformanceAnalytics)
library(quantmod)
#Function to Split Data Frame
OverlapSplit <- function(x,nsplit=1,overlap=0){
nrows <- NROW(x)
nperdf <- ceiling( (nrows + overlap*nsplit) / (nsplit+1) )
start <- seq(1, nsplit*(nperdf-overlap)+1, by= nperdf-overlap )
if( start[nsplit+1] + nperdf != nrows )
warning("Returning an incomplete dataframe.")
lapply(start, function(i) x[c(i:(i+nperdf-1)),])
}
#Function to run regression on 30 days to predict the next day
FL <- as.formula(Next(HAM1)~HAM1+HAM2+HAM3+HAM4)
MyRegression <- function(df,FL) {
df <- as.data.frame(df)
model <- lm(FL,data=df[1:30,])
predict(model,newdata=df[31,])
}
#Function to roll the regression
RollMyRegression <- function(data,ModelFUN,FL) {
rollapply(data, width=31,FUN=ModelFUN,FL,
by.column = FALSE, align = "right", na.pad = FALSE)
}
#Load Data
data(managers)
#Split Dataset
split.data <- OverlapSplit(managers,2,30)
sapply(split.data,dim)
#Run rolling regression on each split
output <- lapply(split.data,RollMyRegression,MyRegression,FL)
output
unlist(output)
In this manner, you can replace lapply at the end with a parallel version of lapply and increase your speed somewhat.
Of course, now there's the issue of optimizing the split/overlap, given you number of processors and the size of your dataset.