Combination of vectors' values with the same positional indices in R - r

Assuming we have the following simplified vectors (in reality, they contain much more values):
n <- c(1,2)
x <- c(4,5,6)
y <- c(7,8,9)
#to get all possible combinations, we can use expand.grid
df <- expand.grid(n=n,
                  x=x,
                  y=y
)
> df
n x y
1 4 7
2 4 7
1 5 7
2 5 7
1 6 7
2 6 7
1 4 8
2 4 8
1 5 8
2 5 8
1 6 8
2 6 8
1 4 9
2 4 9
1 5 9
2 5 9
1 6 9
2 6 9
However, I would like vectors x, y to have the combination where only elements with the same index values are considered, i.e. (x1, y1), (x2, y2), (x3, y3) but NOT (x1,y2), (x1,y3), etc.
while vector n is still used as usual (all its elements are 'paired' with the outcome of x and y combination).
In other words, I would like to get the following df:  
n x y
1 4 7
2 4 7
1 5 8
2 5 8
1 6 9
2 6 9
if n vector had 3 elements, i.e. n <- (1, 2, 3), then we would have:
n x y
1 4 7
2 4 7
3 4 7
1 5 8
2 5 8
3 5 8
1 6 9
2 6 9
3 6 9

You could combine list of pairs that need to be together and then use it in expand.grid
expand.grid(n, Map(c, x, y)) %>% tidyr::unnest_wider(Var2)
Or we can also use crossing using the same logic.
library(tidyverse)
crossing(n, x = map2(x, y, c)) %>%
unnest_wider(x) %>%
rename_at(-1, ~c("x", "y"))
# n x y
# <dbl> <dbl> <dbl>
#1 1 4 7
#2 1 5 8
#3 1 6 9
#4 2 4 7
#5 2 5 8
#6 2 6 9

We can create a function to do this
f1 <- function(vec1, vec2, n) {
d1 <- data.frame(x = vec1, y = vec2)
d2 <- transform(d1[rep(seq_len(nrow(d1)), each = length(n)), ], n = n)
row.names(d2) <- NULL
d2[c('n', 'x', 'y')]
}
f1(x, y, n = 1:2)
# n x y
#1 1 4 7
#2 2 4 7
#3 1 5 8
#4 2 5 8
#5 1 6 9
#6 2 6 9
f1(x, y, n = 1:3)
# n x y
#1 1 4 7
#2 2 4 7
#3 3 4 7
#4 1 5 8
#5 2 5 8
#6 3 5 8
#7 1 6 9
#8 2 6 9
#9 3 6 9
Or in tidyverse
library(dplyr)
library(tidyr)
tibble(x, y) %>%
uncount(length(n)) %>%
mutate(n = rep(n, length.out = n())) %>%
select(n, x, y)
# A tibble: 9 x 3
# n x y
# <int> <dbl> <dbl>
#1 1 4 7
#2 2 4 7
#3 3 4 7
#4 1 5 8
#5 2 5 8
#6 3 5 8
#7 1 6 9
#8 2 6 9
#9 3 6 9
Or create a tibble first and then use that with crossing
tibble(x, y) %>%
crossing(n)
data
n <- 1:3

Here's a tidyverse solution, using purrr::map_df:
library(tidyverse)
map_df(n, ~tibble(n=.x, x, y))
n x y
<dbl> <dbl> <dbl>
1 1 4 7
2 1 5 8
3 1 6 9
4 2 4 7
5 2 5 8
6 2 6 9
If you need the values sorted exactly like your example output, add %>% arrange(x, y) to the output of map.

One option would be to paste together x and y, then use expand grid and separate the columns using the separate function from the tidyr package.
library(dplyr) #for pipe
library(tidyr) #for separate
n <- c(1,2)
x <- c(4,5,6)
y <- c(7,8,9)
z <- paste(x, y, sep = "-")
expand.grid(n = n, xy = z) %>%
separate(xy, sep = "-", into = c("x", "y")) %>%
mutate(x = as.numeric(x), y = as.numeric(y)) %>%
as.tibble()

Related

Transforming columns every nth row to multiple rows

I have a data frame that I successfully separated every nth row of a column and arranged the snippets into rows:
list = data.frame(x = c("A", "1", "2", "3", "B", "1", "2", "3"))
x
1 A
2 1
3 2
4 3
5 B
6 1
7 2
8 3
to
w x y z
1 A 1 2 3
2 B 1 2 3
I managed to achieve this with:
table <- data.frame(matrix(list$x, ncol = 4, byrow = TRUE))
In the next step I would like to do this with multiple columns and place the resulting tables beneath each other.
From something like this:
x y z
1 A D G
2 3 5 6
3 3 7 4
4 6 7 5
5 B E H
6 4 5 2
7 3 3 4
8 7 7 6
9 C F I
10 7 4 3
11 3 6 9
12 6 8 0
to
w x y z
1 A 3 3 6
2 B 4 3 7
3 C 7 3 6
4 D 5 7 6
5 E 5 3 7
6 F 4 6 8
7 G 6 4 7
8 H 2 4 6
9 I 3 9 0
I am really stuck with this one.
If someone has an idea I would highly appreciate some help.
Making use of lapply and dplyr::bind_rows() this could be achieved like so:
library(dplyr)
df_list <- lapply(list, function(x) data.frame(matrix(x, ncol = 4, byrow = TRUE)))
df_list %>%
dplyr::bind_rows() %>%
setNames(c("w", "x", "y", "z"))
#> w x y z
#> 1 A 3 3 6
#> 2 B 4 3 7
#> 3 C 7 3 6
#> 4 D 5 7 7
#> 5 E 5 3 7
#> 6 F 4 6 8
#> 7 G 6 4 5
#> 8 H 2 4 6
#> 9 I 3 9 0
Or using do.call and rbind:
df_list <- do.call(rbind, c(df_list, list(make.row.names = FALSE)))
setNames(df_list, c("w", "x", "y", "z"))
#> w x y z
#> 1 A 3 3 6
#> 2 B 4 3 7
#> 3 C 7 3 6
#> 4 D 5 7 7
#> 5 E 5 3 7
#> 6 F 4 6 8
#> 7 G 6 4 5
#> 8 H 2 4 6
#> 9 I 3 9 0
DATA
list <- read.table(text = " x y z
1 A D G
2 3 5 6
3 3 7 4
4 6 7 5
5 B E H
6 4 5 2
7 3 3 4
8 7 7 6
9 C F I
10 7 4 3
11 3 6 9
12 6 8 0", header = TRUE)
Here is another solution. You could use good old base::by function in order to split a data set into groups and apply a function on each chunk. (Here is to dear #Henrik who taught me this valuable trick):
do.call(rbind, by(df, rep(seq_len(nrow(df)/4), each = 4), FUN = \(x) {
{setNames(as.data.frame(t(x[-4])), c("w", "x", "y", "z")) |>
`rownames<-`(NULL)}
})) |> `rownames<-`(NULL)
w x y z
1 A 3 3 6
2 D 5 7 7
3 G 6 4 5
4 B 4 3 7
5 E 5 3 7
6 H 2 4 6
7 C 7 3 6
8 F 4 6 8
9 I 3 9 0
Because the number of rows in each chunk seems to be known and constant, you may unlist the data frame, and use modulo (%%) to distinguish between the characters belonging to the "sub headers" and the numeric values.
v = unlist(d)
i = (1:nrow(d) - 1) %% 4 == 0
data.frame(w = v[i],
matrix(v[!i], ncol = 3, byrow = TRUE, dimnames = list(NULL, names(d))))
w x y z
x1 A 3 3 6
x5 B 4 3 7
x9 C 7 3 6
y1 D 5 7 7
y5 E 5 3 7
y9 F 4 6 8
z1 G 6 4 5
z5 H 2 4 6
z9 I 3 9 0
Wrap v[i] in as.integer or as.numeric if that's the desired class.
Add row.names = NULL to the data.frame call if you happen to care about row names.
Base R Split-Apply-Combine, a bit-ugly but is generic enough to handle data having more or less than 4 rows per matrix:
res <- data.frame(
do.call(
rbind,
lapply(
with(
lst,
split(
lst,
cumsum(
apply(
lst,
1,
function(row){
all(
grepl(
"[a-zA-Z]",
row
)
)
}
)
)
)
),
function(x){
type.convert(
setNames(
data.frame(
t(x),
stringsAsFactors = FALSE
),
letters[23:26]
)
)
}
)
),
row.names = NULL
)

Sum up tables results from multiple sheets into one table in R

I am reading through excel file that has multiple sheets.
file_to_read <- "./file_name.xlsx"
# Get all names of sheets in the file
sheet_names <- readxl::excel_sheets(file_to_read)
# Loop through sheets
L <- lapply(sheet_names, function(x) {
all_cells <-
tidyxl::xlsx_cells(file_to_read, sheets = x)
})
L here has all the sheets. Now, I need to get the data from each sheet to combine all the columns and rows into one file. To be exact, I want to sum the matching columns and rows in the data into one file.
I will put simple example to make it clear.
For example, this table in one sheet,
df1 <- data.frame(x = 1:5, y = 2:6, z = 3:7)
rownames(df1) <- LETTERS[1:5]
df1
M x y z
A 1 2 3
B 2 3 4
C 3 4 5
D 4 5 6
E 5 6 7
The second table in the next sheet,
df2 <- data.frame(x = 1:5, y = 2:6, z = 3:7, w = 8:12)
rownames(df2) <- LETTERS[3:7]
df2
M x y z w
C 1 2 3 8
D 2 3 4 9
E 3 4 5 10
F 4 5 6 11
G 5 6 7 12
My goal is to combine (sum) the matched records in all 100 tables from one excel file to get one big tables that has the total sum of each value.
The final table should be like this:
M x y z w
A 1 2 3 0
B 2 3 4 0
C 4 6 8 8
D 6 8 10 9
E 8 10 12 10
F 4 5 6 11
G 5 6 7 12
Is there a way to achieve this in R? I am not an expert in R, but I wish if I could know how to read all sheets and do the sum Then save the output to a file.
Thank you
As you have stated that you have hundreds of sheets it is suggested that you should import all of these in one single list say my.list in R (as per this link or this readxl documentation suggested) and follow this strategy instead of binding every two dfs one by one
df1 <- read.table(text = 'M x y z
A 1 2 3
B 2 3 4
C 3 4 5
D 4 5 6
E 5 6 7', header = T)
df2 <- read.table(text = 'M x y z w
C 1 2 3 8
D 2 3 4 9
E 3 4 5 10
F 4 5 6 11
G 5 6 7 12', header = T)
library(tibble)
library(tidyverse)
my.list <- list(df1, df2)
map_dfr(my.list, ~.x)
#> M x y z w
#> 1 A 1 2 3 NA
#> 2 B 2 3 4 NA
#> 3 C 3 4 5 NA
#> 4 D 4 5 6 NA
#> 5 E 5 6 7 NA
#> 6 C 1 2 3 8
#> 7 D 2 3 4 9
#> 8 E 3 4 5 10
#> 9 F 4 5 6 11
#> 10 G 5 6 7 12
map_dfr(my.list , ~ .x) %>%
group_by(M) %>%
summarise(across(everything(), sum, na.rm = T))
#> # A tibble: 7 x 5
#> M x y z w
#> <chr> <int> <int> <int> <int>
#> 1 A 1 2 3 0
#> 2 B 2 3 4 0
#> 3 C 4 6 8 8
#> 4 D 6 8 10 9
#> 5 E 8 10 12 10
#> 6 F 4 5 6 11
#> 7 G 5 6 7 12
Created on 2021-05-26 by the reprex package (v2.0.0)
One approach that will work is these steps:
read each sheet into a list
convert each sheet into a long format
bind into a single data frame
sum and group by over that long data frame
cast back to tabular format
That should work for N sheets with any combination of row and column headers in those sheets. E.g.
file <- "D:\\Book1.xlsx"
sheet_names <- readxl::excel_sheets(file)
sheet_data <- lapply(sheet_names, function(sheet_name) {
readxl::read_xlsx(path = file, sheet = sheet_name)
})
# use pivot_longer on each sheet to make long data
long_sheet_data <- lapply(sheet_data, function(data) {
long <- tidyr::pivot_longer(
data = data,
cols = !M,
names_to = "col",
values_to = "val"
)
})
# combine into a single tibble
long_data = dplyr::bind_rows(long_sheet_data)
# sum up matching pairs of `M` and `col`
summarised <- long_data %>%
group_by(M, col) %>%
dplyr::summarise(agg = sum(val))
# convert to a tabular format
tabular <- summarised %>%
tidyr::pivot_wider(
names_from = col,
values_from = agg,
values_fill = 0
)
tabular
I get this output with a spreadsheet using your initial inputs:
> tabular
# A tibble: 7 x 5
# Groups: M [7]
M x y z w
<chr> <dbl> <dbl> <dbl> <dbl>
1 A 1 2 3 0
2 B 2 3 4 0
3 C 4 6 8 8
4 D 6 8 10 9
5 E 8 10 12 10
6 F 4 5 6 11
7 G 5 6 7 12
You could use dplyr and tidyr to get your desired result:
Let be
df <- data.frame(subject=c(rep("Mother", 2), rep("Child", 2)), modifier=c("chart2", "child", "tech", "unkn"), mother_chart2=1:4, mother_child=5:8, child_tech=9:12, child_unkn=13:16)
> df
subject modifier mother_chart2 mother_child child_tech child_unkn
1 Mother chart2 1 5 9 13
2 Mother child 2 6 10 14
3 Child tech 3 7 11 15
4 Child unkn 4 8 12 16
and
df2 <- data.frame(subject=c(rep("Mother", 2), rep("Child", 2)), modifier=c("chart", "child", "tech", "unkn"), mother_chart=101:104, mother_child=105:108, child_tech=109:112, child_unkn=113:116)
> df2
subject modifier mother_chart mother_child child_tech child_unkn
1 Mother chart 101 105 109 113
2 Mother child 102 106 110 114
3 Child tech 103 107 111 115
4 Child unkn 104 108 112 116
Then
library(dplyr)
library(tidyr)
df2_tmp <- df2 %>%
pivot_longer(col=-c("subject", "modifier"))
df %>%
pivot_longer(col=-c("subject", "modifier")) %>%
full_join(df2_tmp, by=c("subject", "modifier", "name")) %>%
mutate(across(starts_with("value"), ~ replace_na(., 0)),
sum = value.x + value.y) %>%
select(-value.x, -value.y) %>%
pivot_wider(names_from=name, values_from=sum, values_fill=0)
returns
# A tibble: 5 x 7
subject modifier mother_chart2 mother_child child_tech child_unkn mother_chart
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Mother chart2 1 5 9 13 0
2 Mother child 2 112 120 128 102
3 Child tech 3 114 122 130 103
4 Child unkn 4 116 124 132 104
5 Mother chart 0 105 109 113 101

Creating a New Column that Compared Two Other Columns in R

My data looks like:
x y
2 5
3 6
4 2
6 1
7 10
12 16
145 1
Looking to output the number that's smaller than the other into a new column which would look like:
x y z
2 5 2
3 6 3
4 2 2
6 1 1
7 10 7
12 16 12
145 1 1
None of the data will be equal so you don't need to worry about that.
x <- c(2,3,4,6,7,12,145)
y <- c(5,6,2,1,10,16,1)
df <- data.frame(x,y)
Using case_when from tidyverse
remove(list = ls())
x <- c(2,3,4,6,7,12,145)
y <- c(5,6,2,1,10,16,1)
df <- data.frame(x,y)
df <- df %>%
mutate(z =
case_when(
x < y ~ x,
TRUE ~ y
)
)
df
You can use pmin to get minimum between x and y columns.
df$z <- pmin(df$x, df$y)
df
# x y z
#1 2 5 2
#2 3 6 3
#3 4 2 2
#4 6 1 1
#5 7 10 7
#6 12 16 12
#7 145 1 1

Selecting top N rows for each group based on value in column

I have dataframe like below :-
x<-c(3,2,1,8,7,11,10,9,7,5,4)
y<-c("a","a","a", "b","b","c","c","c","c","c","c")
z<-c(2,2,2,1,1,3,3,3,3,3,3)
df<-data.frame(x,y,z)
df
x y z
1 3 a 2
2 2 a 2
3 1 a 2
4 8 b 1
5 7 b 1
6 11 c 3
7 10 c 3
8 9 c 3
9 7 c 3
10 5 c 3
11 4 c 3
I want to select top n row for each group by column y where n is provided in column z.
So the output should be like :
output:
x y z
1 3 a 2
2 2 a 2
3 8 b 1
4 11 c 3
5 10 c 3
6 9 c 3
A solution with base R:
# df is split according to y, then we keep only the top "z" value (after ordering x)
# and rbind everything back together:
do.call(rbind,
lapply(split(df, df$y),
function(df1) df1[order(df1$x, decreasing=TRUE), ][1:unique(df1$z), ]))
# x y z
#a.1 3 a 2
#a.2 2 a 2
#b 8 b 1
#c.6 11 c 3
#c.7 10 c 3
#c.8 9 c 3
EDIT:
A much more direct way (still in base R) provided in comment by #mt1022:
df[ave(1:nrow(df), df$y, FUN = seq_along) <= df$z, ]
# x y z
#1 3 a 2
#2 2 a 2
#4 8 b 1
#6 11 c 3
#7 10 c 3
#8 9 c 3
One approach with data.table:
library(data.table)
setDT(df)
df[,.(inc=seq_len(.N)<=z,x,z),by=.(y)][inc==T ,-2]
# y x z
#1: a 3 2
#2: a 2 2
#3: b 8 1
#4: c 11 3
#5: c 10 3
#6: c 9 3
A solution with dplyr that uses do:
df %>%
group_by(y) %>%
do(head(.,as.numeric(unique(.$z))))
I'm posting the solution I was looking for using dplyr. It is based on #HNSKD:
library(dplyr)
x<-c(3,2,1,8,7,11,10,9,7,5,4)
y<-c("a","a","a", "b","b","c","c","c","c","c","c")
z<-c(2,2,2,1,1,3,3,3,3,3,3)
df<-data.frame(x,y,z)
df %>% group_by(y) %>% slice(1:2)
Which returns the first two elements for each y:
# A tibble: 6 x 3
# Groups: y [3]
x y z
<dbl> <fct> <dbl>
1 3 a 2
2 2 a 2
3 8 b 1
4 7 b 1
5 11 c 3
6 10 c 3

Split a data frame into overlapping dataframes

I'm trying to write a function that behaves as follows, but it is proving very difficult:
DF <- data.frame(x = seq(1,10), y = rep(c('a','b','c','d','e'),2))
> DF
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
>OverLapSplit(DF,nsplits=2,overlap=2)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
[[2]]
x y
1 5 a
2 6 b
3 7 c
4 8 d
5 9 e
6 10 a
>OverLapSplit(DF,nsplits=1)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
>OverLapSplit(DF,nsplits=2,overlap=4)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
7 7 b
[[2]]
x y
1 4 e
2 5 a
3 6 b
4 7 c
5 8 d
6 9 e
7 10 a
>OverLapSplit(DF,nsplits=5,overlap=1)
[[1]]
x y
1 1 a
2 2 b
3 3 c
[[2]]
x y
1 3 c
2 4 d
3 5 e
[[3]]
x y
1 5 e
2 6 a
3 7 b
[[4]]
x y
1 7 b
2 8 c
3 9 d
[[5]]
x y
1 8 d
2 9 e
3 10 f
I haven't thought a lot about what would happen if you tried something like OverLapSplit(DF,nsplits=2,overlap=1)
Maybe the following:
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
[[2]]
x y
1 5 a
2 6 b
3 7 c
4 8 d
5 9 e
6 10 a
Thanks!
Try something like :
OverlapSplit <- function(x,nsplit=1,overlap=2){
nrows <- NROW(x)
nperdf <- ceiling( (nrows + overlap*nsplit) / (nsplit+1) )
start <- seq(1, nsplit*(nperdf-overlap)+1, by= nperdf-overlap )
if( start[nsplit+1] + nperdf != nrows )
warning("Returning an incomplete dataframe.")
lapply(start, function(i) x[c(i:(i+nperdf-1)),])
}
with nsplit the number of splits! (nsplit=1 returns 2 dataframes). This will render an incomplete last dataframe in case the overlap splits don't really fit in the dataframe, and issues a warning.
> OverlapSplit(DF,nsplit=3,overlap=2)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
[[2]]
x y
3 3 c
4 4 d
5 5 e
6 6 a
[[3]]
x y
5 5 e
6 6 a
7 7 b
8 8 c
[[4]]
x y
7 7 b
8 8 c
9 9 d
10 10 e
And one with a warning
> OverlapSplit(DF,nsplit=1,overlap=1)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
[[2]]
x y
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
NA NA <NA>
Warning message:
In OverlapSplit(DF, nsplit = 1, overlap = 1) :
Returning an incomplete dataframe.
This uses the shingle idea from Lattice graphics and so leverages code from package lattice to generate the intervals and then uses a loop to break the original DF into the correct subsets.
I wasn't exactly sure what is meant by overlap = 1 - I presume you meant overlap by 1 sample/observation. If so, the code below does this.
OverlapSplit <- function(x, nsplits = 1, overlap = 0) {
stopifnot(require(lattice))
N <- seq_len(nr <- nrow(x))
interv <- co.intervals(N, nsplits, overlap / nr)
out <- vector(mode = "list", length = nrow(interv))
for(i in seq_along(out)) {
out[[i]] <- x[interv[i,1] < N & N < interv[i,2], , drop = FALSE]
}
out
}
Which gives:
> OverlapSplit(DF, 2, 2)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
[[2]]
x y
5 5 e
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
> OverlapSplit(DF)
[[1]]
x y
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 a
7 7 b
8 8 c
9 9 d
10 10 e
> OverlapSplit(DF, 4, 1)
[[1]]
x y
1 1 a
2 2 b
3 3 c
[[2]]
x y
3 3 c
4 4 d
5 5 e
[[3]]
x y
6 6 a
7 7 b
8 8 c
[[4]]
x y
8 8 c
9 9 d
10 10 e
Just to make it clear what I'm doing here:
#Load Libraries
library(PerformanceAnalytics)
library(quantmod)
#Function to Split Data Frame
OverlapSplit <- function(x,nsplit=1,overlap=0){
nrows <- NROW(x)
nperdf <- ceiling( (nrows + overlap*nsplit) / (nsplit+1) )
start <- seq(1, nsplit*(nperdf-overlap)+1, by= nperdf-overlap )
if( start[nsplit+1] + nperdf != nrows )
warning("Returning an incomplete dataframe.")
lapply(start, function(i) x[c(i:(i+nperdf-1)),])
}
#Function to run regression on 30 days to predict the next day
FL <- as.formula(Next(HAM1)~HAM1+HAM2+HAM3+HAM4)
MyRegression <- function(df,FL) {
df <- as.data.frame(df)
model <- lm(FL,data=df[1:30,])
predict(model,newdata=df[31,])
}
#Function to roll the regression
RollMyRegression <- function(data,ModelFUN,FL) {
rollapply(data, width=31,FUN=ModelFUN,FL,
by.column = FALSE, align = "right", na.pad = FALSE)
}
#Load Data
data(managers)
#Split Dataset
split.data <- OverlapSplit(managers,2,30)
sapply(split.data,dim)
#Run rolling regression on each split
output <- lapply(split.data,RollMyRegression,MyRegression,FL)
output
unlist(output)
In this manner, you can replace lapply at the end with a parallel version of lapply and increase your speed somewhat.
Of course, now there's the issue of optimizing the split/overlap, given you number of processors and the size of your dataset.

Resources