Transforming columns every nth row to multiple rows - r

I have a data frame that I successfully separated every nth row of a column and arranged the snippets into rows:
list = data.frame(x = c("A", "1", "2", "3", "B", "1", "2", "3"))
x
1 A
2 1
3 2
4 3
5 B
6 1
7 2
8 3
to
w x y z
1 A 1 2 3
2 B 1 2 3
I managed to achieve this with:
table <- data.frame(matrix(list$x, ncol = 4, byrow = TRUE))
In the next step I would like to do this with multiple columns and place the resulting tables beneath each other.
From something like this:
x y z
1 A D G
2 3 5 6
3 3 7 4
4 6 7 5
5 B E H
6 4 5 2
7 3 3 4
8 7 7 6
9 C F I
10 7 4 3
11 3 6 9
12 6 8 0
to
w x y z
1 A 3 3 6
2 B 4 3 7
3 C 7 3 6
4 D 5 7 6
5 E 5 3 7
6 F 4 6 8
7 G 6 4 7
8 H 2 4 6
9 I 3 9 0
I am really stuck with this one.
If someone has an idea I would highly appreciate some help.

Making use of lapply and dplyr::bind_rows() this could be achieved like so:
library(dplyr)
df_list <- lapply(list, function(x) data.frame(matrix(x, ncol = 4, byrow = TRUE)))
df_list %>%
dplyr::bind_rows() %>%
setNames(c("w", "x", "y", "z"))
#> w x y z
#> 1 A 3 3 6
#> 2 B 4 3 7
#> 3 C 7 3 6
#> 4 D 5 7 7
#> 5 E 5 3 7
#> 6 F 4 6 8
#> 7 G 6 4 5
#> 8 H 2 4 6
#> 9 I 3 9 0
Or using do.call and rbind:
df_list <- do.call(rbind, c(df_list, list(make.row.names = FALSE)))
setNames(df_list, c("w", "x", "y", "z"))
#> w x y z
#> 1 A 3 3 6
#> 2 B 4 3 7
#> 3 C 7 3 6
#> 4 D 5 7 7
#> 5 E 5 3 7
#> 6 F 4 6 8
#> 7 G 6 4 5
#> 8 H 2 4 6
#> 9 I 3 9 0
DATA
list <- read.table(text = " x y z
1 A D G
2 3 5 6
3 3 7 4
4 6 7 5
5 B E H
6 4 5 2
7 3 3 4
8 7 7 6
9 C F I
10 7 4 3
11 3 6 9
12 6 8 0", header = TRUE)

Here is another solution. You could use good old base::by function in order to split a data set into groups and apply a function on each chunk. (Here is to dear #Henrik who taught me this valuable trick):
do.call(rbind, by(df, rep(seq_len(nrow(df)/4), each = 4), FUN = \(x) {
{setNames(as.data.frame(t(x[-4])), c("w", "x", "y", "z")) |>
`rownames<-`(NULL)}
})) |> `rownames<-`(NULL)
w x y z
1 A 3 3 6
2 D 5 7 7
3 G 6 4 5
4 B 4 3 7
5 E 5 3 7
6 H 2 4 6
7 C 7 3 6
8 F 4 6 8
9 I 3 9 0

Because the number of rows in each chunk seems to be known and constant, you may unlist the data frame, and use modulo (%%) to distinguish between the characters belonging to the "sub headers" and the numeric values.
v = unlist(d)
i = (1:nrow(d) - 1) %% 4 == 0
data.frame(w = v[i],
matrix(v[!i], ncol = 3, byrow = TRUE, dimnames = list(NULL, names(d))))
w x y z
x1 A 3 3 6
x5 B 4 3 7
x9 C 7 3 6
y1 D 5 7 7
y5 E 5 3 7
y9 F 4 6 8
z1 G 6 4 5
z5 H 2 4 6
z9 I 3 9 0
Wrap v[i] in as.integer or as.numeric if that's the desired class.
Add row.names = NULL to the data.frame call if you happen to care about row names.

Base R Split-Apply-Combine, a bit-ugly but is generic enough to handle data having more or less than 4 rows per matrix:
res <- data.frame(
do.call(
rbind,
lapply(
with(
lst,
split(
lst,
cumsum(
apply(
lst,
1,
function(row){
all(
grepl(
"[a-zA-Z]",
row
)
)
}
)
)
)
),
function(x){
type.convert(
setNames(
data.frame(
t(x),
stringsAsFactors = FALSE
),
letters[23:26]
)
)
}
)
),
row.names = NULL
)

Related

How to add value into new column based on corresponding value in another column?

This is the sample data with 'y' being the new variable created.
x
A
B
C
y
A
1
4
7
B
5
6
7
C
3
5
3
If the value of column x ="A", I would like the value of col.A to be displayed in column y. And similarly for the "B" & "C" values in column x.
Final result should be something like this.
x
A
B
C
y
A
1
4
7
1
B
5
6
7
6
C
3
5
3
3
A proposition :
df <- read.table(header=TRUE, text="
x A B C
A 1 4 7
B 5 6 7
C 3 5 3
"
)
df$y <- paste0("df$",df$x,"[df$x=='",df$x,"']")
df
#> x A B C y
#> 1 A 1 4 7 df$A[df$x=='A']
#> 2 B 5 6 7 df$B[df$x=='B']
#> 3 C 3 5 3 df$C[df$x=='C']
df$y <- eval(ivmte:::unstring(df$y))
df
#> x A B C y
#> 1 A 1 4 7 1
#> 2 B 5 6 7 6
#> 3 C 3 5 3 3
# Created on 2021-01-30 by the reprex package (v0.3.0.9001)
Regards,
Try this:
create_column<-function(){
y<-numeric(nrow(your_dataframe))
for (i in 1:nrow(your_dataframe)){
y[i]<-your_dataframe[i, which(names(your_dataframe)==your_dataframe$x[i])]
}
cbind(your_dataframe, y)
}
create_column()
x A B C y
1 A 1 4 7 1
2 B 5 6 7 6
3 C 3 5 3 3
>
another option with apply:
cbind(your_dataframe, y=apply(your_dataframe, 1, function(x){
x[which(names(x)==x['x'])]
}))
> your_dataframe
x A B C y
1 A 1 4 7 1
2 B 5 6 7 6
3 C 3 5 3 3
Try this
df$y <- df[-1][cbind(seq(nrow(df)),match(df$x,names(df)[-1]))]

Converting a list of list of data.frames to a single data.frame in R

I have a list of list of data.frames (see L below).
I was wondering if it might be possible to convert L to my desired output shown below which is a single data.frame?
L <- list(A = list(Short = data.frame(d = 1:2, SD = 3:4)),
B = list(Short = data.frame(d = 2:3, SD = 1:2), Long1 = data.frame(d = 7:8, SD = 6:7)),
C = list(Short = data.frame(d = 5:6, SD = 3:4), Long1 = data.frame(d = 8:9, SD = 1:2),
Long2 = data.frame(d = 4:5, SD = 6:7)))
Desired output (a data.frame):
d SD id
1 3 1
2 4 1
2 1 2
3 2 2
7 6 2
8 7 2
5 3 3
6 4 3
8 1 3
9 2 3
4 6 3
5 7 3
We could try rbinding every list in L and adding a new column which would denote the list number and finally bring the entire list into one dataframe using do.call and rbind.
output <- do.call(rbind, lapply(seq_along(L), function(x)
transform(do.call(rbind, L[[x]]), id = x)))
rownames(output) <- NULL
output
# d SD id
#1 1 3 1
#2 2 4 1
#3 2 1 2
#4 3 2 2
#5 7 6 2
#6 8 7 2
#7 5 3 3
#8 6 4 3
#9 8 1 3
#10 9 2 3
#11 4 6 3
#12 5 7 3
It might be a bit shorter using dplyr's bind_rows with purrr::map but this gives id variable as name of the list(A, B, C) instead of sequence which should not be difficult to change.
library(dplyr)
bind_rows(purrr::map(L, bind_rows), .id = "id") %>%
mutate(id = match(id, unique(id)))
We can use lapply/Map in base R. We can loop through the list with lapply, rbind the nested list elements, then create a new column with Map and rbind the outer list elements
out <- do.call(rbind, Map(cbind, lapply(L, function(x)
do.call(rbind, x)), id = seq_along(L)))
row.names(out) <- NULL
out
# d SD id
#1 1 3 1
#2 2 4 1
#3 2 1 2
#4 3 2 2
#5 7 6 2
#6 8 7 2
#7 5 3 3
#8 6 4 3
#9 8 1 3
#10 9 2 3
#11 4 6 3
#12 5 7 3
Based on the comments, if we need to add another column from the names of the inner list
out1 <- do.call(rbind, Map(cbind, lapply(L, function(dat)
do.call(rbind, Map(cbind, dat, es.type = names(dat)))), id = seq_along(L)))
row.names(out1) <- NULL
out1
# d SD es.type id
#1 1 3 Short 1
#2 2 4 Short 1
#3 2 1 Short 2
#4 3 2 Short 2
#5 7 6 Long1 2
#6 8 7 Long1 2
#7 5 3 Short 3
#8 6 4 Short 3
#9 8 1 Long1 3
#10 9 2 Long1 3
#11 4 6 Long2 3
#12 5 7 Long2 3
If there are ..\\d+ and want to remove
out1 <- do.call(rbind, Map(cbind, lapply(L, function(dat)
do.call(rbind, Map(cbind, dat,
es.type = sub("\\.*\\d+$", "", names(dat))))), id = seq_along(L)))
row.names(out1) <- NULL
out1
# d SD es.type id
#1 1 3 Short 1
#2 2 4 Short 1
#3 2 1 Short 2
#4 3 2 Short 2
#5 7 6 Long 2
#6 8 7 Long 2
#7 5 3 Short 3
#8 6 4 Short 3
#9 8 1 Long 3
#10 9 2 Long 3
#11 4 6 Long 3
#12 5 7 Long 3
Here is another possible approach using purrr's flatten_dfr:
library(purrr)
transform(flatten_dfr(L), id = rep(seq_along(L), times = map(L, ~sum(lengths(.x)))))
#> d SD id
#> 1 1 3 1
#> 2 2 4 1
#> 3 2 1 2
#> 4 3 2 2
#> 5 7 6 2
#> 6 8 7 2
#> 7 5 3 3
#> 8 6 4 3
#> 9 8 1 3
#> 10 9 2 3
#> 11 4 6 3
#> 12 5 7 3
NB: here I used base R's transform which could be replaced by dplyr's mutate
rbindlist() is a convenience function which makes one data.table from a list of many. For this nested list it has to be applied twice recursively.
In addition, it has the idcol parameter which creates a column in the result showing which list item those rows came from.
library(data.table)
rbindlist(lapply(L, rbindlist, idcol = "es.type"), idcol = "id")
id es.type d SD
1: A Short 1 3
2: A Short 2 4
3: B Short 2 1
4: B Short 3 2
5: B Long1 7 6
6: B Long1 8 7
7: C Short 5 3
8: C Short 6 4
9: C Long1 8 1
10: C Long1 9 2
11: C Long2 4 6
12: C Long2 5 7
Now, the OP has requested that id is numeric and that Long1 and Long2 must become Long. This can be achieved by subsequent operations on the result columns:
rbindlist(lapply(L, rbindlist, idcol = "es.type"), idcol = "id")[
, id := rleid(id)][
, es.type := sub("\\d+$", "", es.type)][]
id es.type d SD
1: 1 Short 1 3
2: 1 Short 2 4
3: 2 Short 2 1
4: 2 Short 3 2
5: 2 Long 7 6
6: 2 Long 8 7
7: 3 Short 5 3
8: 3 Short 6 4
9: 3 Long 8 1
10: 3 Long 9 2
11: 3 Long 4 6
12: 3 Long 5 7
In base R, we can achieve the same by
do.call("rbind", lapply(L, do.call, what = "rbind"))
which returns
d SD
A.Short.1 1 3
A.Short.2 2 4
B.Short.1 2 1
B.Short.2 3 2
B.Long1.1 7 6
B.Long1.2 8 7
C.Short.1 5 3
C.Short.2 6 4
C.Long1.1 8 1
C.Long1.2 9 2
C.Long2.1 4 6
C.Long2.2 5 7
id and es.type can be retrieved from parsing the row names, e.g.,
DF <- do.call("rbind", lapply(L, do.call, what = "rbind"))
id <- stringr::str_extract(row.names(DF), "^[^.]*")
# create sequence number (that's what data.table::rleid() does)
DF$id <- c(1L, cumsum(head(id, -1L) != tail(id, -1L)) + 1L)
DF$es.type <- stringr::str_extract(row.names(DF), "(?<=\\.)[^.0-9]*")
row.names(DF) <- NULL
DF
d SD id es.type
1 1 3 1 Short
2 2 4 1 Short
3 2 1 2 Short
4 3 2 2 Short
5 7 6 2 Long
6 8 7 2 Long
7 5 3 3 Short
8 6 4 3 Short
9 8 1 3 Long
10 9 2 3 Long
11 4 6 3 Long
12 5 7 3 Long

R how to subset a list use another list

I have two lists, and I want to subset listA by using listB.
Let's say I have listA and ListB, I want listC.
listA <- list(a = data.frame(x = 1:5, y = 6:10),
b = data.frame(x = 4:8, y = 7:11))
> listA
$a
x y
1 1 6
2 2 7
3 3 8
4 4 9
5 5 10
$b
x y
1 4 7
2 5 8
3 6 9
4 7 10
5 8 11
listB <- list(a = c(3,5), b = c(4, 7))
I want listC should be:
> listC
$a
x y
3 3 8
4 4 9
5 5 10
$b
x y
1 4 7
2 5 8
3 6 9
4 7 10
I appreciate any help!
It sounds like you need to use mapply. Try this:
fun <- function(df, sq) df[df$x %in% seq(sq[1], sq[2]), ]
listC <- mapply(fun, listA, listB, SIMPLIFY = FALSE)
listC
This gives
> listC
$a
x y
3 3 8
4 4 9
5 5 10
$b
x y
1 4 7
2 5 8
3 6 9
4 7 10

How to merge two datasets by the different values in R?

I have two datasets and want to merge them. How I add to first dataset only the lines that are in the second that are not in the first?
Only add to final dataset if the value not exists in the another dataset. An example dataset:
x = data.frame(id = c("a","c","d","g"),
value = c(1,3,4,7))
y = data.frame(id = c("b","c","d","e","f"),
value = c(5,6,8,9,7))
The merged dataset should look like (the order is not important):
a 1
b 5
c 3
d 4
e 9
f 7
g 7
Using !, %in% and rbind:
rbind(x[!x$id %in% y$id,], y)
id value
1 a 1
4 g 7
3 b 2
41 c 3
5 d 4
6 e 5
7 f 6
For your example to work, you first need to ensure that id in each data.frame are directly comparable. Since they're factors, you need ensure they have the same levels/labels; or you can just convert them to character.
# convert factors to character
x$id <- as.character(x$id)
y$id <- as.character(y$id)
# merge
z <- merge(x,y,by="id",all=TRUE)
# keep first value, if it exists
z$value <- ifelse(is.na(z$value.x),z$value.y,z$value.x)
# keep desired columns
z <- z[,c("id","value")]
z
# id value
# 1 a 1
# 2 b 5
# 3 c 3
# 4 d 4
# 5 e 9
# 6 f 7
# 7 g 7
You already answered your own question, but just didn't realize it right away. :)
> merge(x,y,all=TRUE)
id value
1 a 1
2 c 3
3 c 6
4 d 4
5 d 8
6 g 7
7 b 5
8 e 9
9 f 7
EDIT
I'm a bit dense here and I'm not sure where you're getting at, so I provide you with a shotgun approach. What I did was I merged the data.frames by id and copied values from x to y if y` was missing. Take whichever column you need.
> x = data.frame(id = c("a","c","d","g"),
+ value = c(1,3,4,7))
> y = data.frame(id = c("b","c","d","e","f"),
+ value = c(5,6,8,9,7))
> xy <- merge(x, y, by = "id", all = TRUE)
> xy
id value.x value.y
1 a 1 NA
2 c 3 6
3 d 4 8
4 g 7 NA
5 b NA 5
6 e NA 9
7 f NA 7
> find.na <- is.na(xy[, "value.y"])
> xy$new.col <- xy[, "value.y"]
> xy[find.na, "new.col"] <- xy[find.na, "value.x"]
> xy
id value.x value.y new.col
1 a 1 NA 1
2 c 3 6 6
3 d 4 8 8
4 g 7 NA 7
5 b NA 5 5
6 e NA 9 9
7 f NA 7 7
> xy[order(as.character(xy$id)), ]
id value.x value.y new.col
1 a 1 NA 1
5 b NA 5 5
2 c 3 6 6
3 d 4 8 8
6 e NA 9 9
7 f NA 7 7
4 g 7 NA 7

Split a data frame into overlapping dataframes

I'm trying to write a function that behaves as follows, but it is proving very difficult:
DF <- data.frame(x = seq(1,10), y = rep(c('a','b','c','d','e'),2))
> DF
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
>OverLapSplit(DF,nsplits=2,overlap=2)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
[[2]]
x y
1 5 a
2 6 b
3 7 c
4 8 d
5 9 e
6 10 a
>OverLapSplit(DF,nsplits=1)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
>OverLapSplit(DF,nsplits=2,overlap=4)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
7 7 b
[[2]]
x y
1 4 e
2 5 a
3 6 b
4 7 c
5 8 d
6 9 e
7 10 a
>OverLapSplit(DF,nsplits=5,overlap=1)
[[1]]
x y
1 1 a
2 2 b
3 3 c
[[2]]
x y
1 3 c
2 4 d
3 5 e
[[3]]
x y
1 5 e
2 6 a
3 7 b
[[4]]
x y
1 7 b
2 8 c
3 9 d
[[5]]
x y
1 8 d
2 9 e
3 10 f
I haven't thought a lot about what would happen if you tried something like OverLapSplit(DF,nsplits=2,overlap=1)
Maybe the following:
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
[[2]]
x y
1 5 a
2 6 b
3 7 c
4 8 d
5 9 e
6 10 a
Thanks!
Try something like :
OverlapSplit <- function(x,nsplit=1,overlap=2){
nrows <- NROW(x)
nperdf <- ceiling( (nrows + overlap*nsplit) / (nsplit+1) )
start <- seq(1, nsplit*(nperdf-overlap)+1, by= nperdf-overlap )
if( start[nsplit+1] + nperdf != nrows )
warning("Returning an incomplete dataframe.")
lapply(start, function(i) x[c(i:(i+nperdf-1)),])
}
with nsplit the number of splits! (nsplit=1 returns 2 dataframes). This will render an incomplete last dataframe in case the overlap splits don't really fit in the dataframe, and issues a warning.
> OverlapSplit(DF,nsplit=3,overlap=2)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
[[2]]
x y
3 3 c
4 4 d
5 5 e
6 6 a
[[3]]
x y
5 5 e
6 6 a
7 7 b
8 8 c
[[4]]
x y
7 7 b
8 8 c
9 9 d
10 10 e
And one with a warning
> OverlapSplit(DF,nsplit=1,overlap=1)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
[[2]]
x y
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
NA NA <NA>
Warning message:
In OverlapSplit(DF, nsplit = 1, overlap = 1) :
Returning an incomplete dataframe.
This uses the shingle idea from Lattice graphics and so leverages code from package lattice to generate the intervals and then uses a loop to break the original DF into the correct subsets.
I wasn't exactly sure what is meant by overlap = 1 - I presume you meant overlap by 1 sample/observation. If so, the code below does this.
OverlapSplit <- function(x, nsplits = 1, overlap = 0) {
stopifnot(require(lattice))
N <- seq_len(nr <- nrow(x))
interv <- co.intervals(N, nsplits, overlap / nr)
out <- vector(mode = "list", length = nrow(interv))
for(i in seq_along(out)) {
out[[i]] <- x[interv[i,1] < N & N < interv[i,2], , drop = FALSE]
}
out
}
Which gives:
> OverlapSplit(DF, 2, 2)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
[[2]]
x y
5 5 e
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
> OverlapSplit(DF)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
> OverlapSplit(DF, 4, 1)
[[1]]
x y
1 1 a
2 2 b
3 3 c
[[2]]
x y
3 3 c
4 4 d
5 5 e
[[3]]
x y
6 6 a
7 7 b
8 8 c
[[4]]
x y
8 8 c
9 9 d
10 10 e
Just to make it clear what I'm doing here:
#Load Libraries
library(PerformanceAnalytics)
library(quantmod)
#Function to Split Data Frame
OverlapSplit <- function(x,nsplit=1,overlap=0){
nrows <- NROW(x)
nperdf <- ceiling( (nrows + overlap*nsplit) / (nsplit+1) )
start <- seq(1, nsplit*(nperdf-overlap)+1, by= nperdf-overlap )
if( start[nsplit+1] + nperdf != nrows )
warning("Returning an incomplete dataframe.")
lapply(start, function(i) x[c(i:(i+nperdf-1)),])
}
#Function to run regression on 30 days to predict the next day
FL <- as.formula(Next(HAM1)~HAM1+HAM2+HAM3+HAM4)
MyRegression <- function(df,FL) {
df <- as.data.frame(df)
model <- lm(FL,data=df[1:30,])
predict(model,newdata=df[31,])
}
#Function to roll the regression
RollMyRegression <- function(data,ModelFUN,FL) {
rollapply(data, width=31,FUN=ModelFUN,FL,
by.column = FALSE, align = "right", na.pad = FALSE)
}
#Load Data
data(managers)
#Split Dataset
split.data <- OverlapSplit(managers,2,30)
sapply(split.data,dim)
#Run rolling regression on each split
output <- lapply(split.data,RollMyRegression,MyRegression,FL)
output
unlist(output)
In this manner, you can replace lapply at the end with a parallel version of lapply and increase your speed somewhat.
Of course, now there's the issue of optimizing the split/overlap, given you number of processors and the size of your dataset.

Resources