R: format a wide table to a long table

  cutoff        KM KM_lo KM_hi  rstm rstm_lo rstm_hi
  <chr>      <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>
1 2017-01-01   2.1   1.4   4.9   7.2     3.9    10.2
2 2017-04-01   3.5   2.1   4.7   8.9     6.6    10.8
3 2017-07-01   3.7   2.8   4.2   7.2     6.2     8.4
How do I convert this to a long table? I am struggling to get it into the format I want; I tried the gather() and melt() functions. The output table should look something like this:
  cutoff     VAR     Val Val-hi Val-lo
  <chr>      <chr> <dbl>  <dbl>  <dbl>
1 2017-01-01 KM      2.1    4.9    1.4
2 2017-01-01 rstm    7.2   10.2    3.9
3 2017-07-01 KM      3.7    4.2    2.8
Sample data
structure(list(cutoff = c("2017-01-01", "2017-04-01", "2017-07-01"
), KM = c(2.1, 3.5, 3.7), KM_lo = c(1.4, 2.1, 2.8), KM_hi = c(4.9,
4.7, 4.2), rstm = c(7.2, 8.9, 7.2), rstm_lo = c(3.9, 6.6, 6.2
), rstm_hi = c(10.2, 10.8, 8.4)), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))

We may do:
library(dplyr)
library(tidyr)
library(stringr)
df1 %>%
  rename_with(~ str_c(., "_none"), c("KM", "rstm")) %>%
  pivot_longer(cols = -cutoff, names_to = c("VAR", ".value"),
               names_sep = "_") %>%
  rename_with(~ c("Val", "Val-lo", "Val-hi"), 3:5)
-output
# A tibble: 6 × 5
  cutoff     VAR     Val `Val-lo` `Val-hi`
  <chr>      <chr> <dbl>    <dbl>    <dbl>
1 2017-01-01 KM      2.1      1.4      4.9
2 2017-01-01 rstm    7.2      3.9     10.2
3 2017-04-01 KM      3.5      2.1      4.7
4 2017-04-01 rstm    8.9      6.6     10.8
5 2017-07-01 KM      3.7      2.8      4.2
6 2017-07-01 rstm    7.2      6.2      8.4
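
For comparison, here is a hedged data.table sketch of the same reshape via melt() with a list of patterns (a sketch assuming data.table >= 1.12 and the dput sample above loaded as df1; the VAR recoding step is an assumption based on the column order):
library(data.table)
dt1 <- as.data.table(df1)
long <- melt(dt1, id.vars = "cutoff",
             measure.vars = patterns("^(KM|rstm)$", "_lo$", "_hi$"),
             value.name = c("Val", "Val-lo", "Val-hi"),
             variable.name = "VAR")
long[, VAR := c("KM", "rstm")[as.integer(VAR)]]  # 1 = KM, 2 = rstm (column order)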

Here is another pivot_longer approach:
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(
    -cutoff,
    names_to = c("VAR", ".value"),
    names_pattern = "(.+)_(.+)"
  ) %>%
  na.omit()
# note: the bare KM and rstm columns do not match the pattern, so the point
# estimates end up as NA and are dropped; only the lo/hi columns remain
  cutoff     VAR      lo    hi
  <chr>      <chr> <dbl> <dbl>
1 2017-01-01 KM      1.4   4.9
2 2017-01-01 rstm    3.9  10.2
3 2017-04-01 KM      2.1   4.7
4 2017-04-01 rstm    6.6  10.8
5 2017-07-01 KM      2.8   4.2
6 2017-07-01 rstm    6.2   8.4

library(tidyverse)
df <- structure(
  list(
    cutoff = c("2017-01-01", "2017-04-01", "2017-07-01"),
    KM = c(2.1, 3.5, 3.7),
    KM_lo = c(1.4, 2.1, 2.8),
    KM_hi = c(4.9, 4.7, 4.2),
    rstm = c(7.2, 8.9, 7.2),
    rstm_lo = c(3.9, 6.6, 6.2),
    rstm_hi = c(10.2, 10.8, 8.4)
  ),
  row.names = c(NA, -3L),
  class = c("tbl_df", "tbl", "data.frame")
)
df %>%
  pivot_longer(cols = -cutoff) %>%
  separate(col = name, into = c("name", "suffix"), sep = "_", remove = TRUE) %>%
  mutate(id = data.table::rleid(name)) %>%
  pivot_wider(id_cols = c(id, cutoff, name), names_from = suffix,
              names_prefix = "VAL_", values_from = value) %>%
  select(-id) %>%
  rename(VAL = VAL_NA)
#> Warning: Expected 2 pieces. Missing pieces filled with `NA` in 6 rows [1, 4, 7,
#> 10, 13, 16].
#> # A tibble: 6 x 5
#> cutoff name VAL VAL_lo VAL_hi
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 2017-01-01 KM 2.1 1.4 4.9
#> 2 2017-01-01 rstm 7.2 3.9 10.2
#> 3 2017-04-01 KM 3.5 2.1 4.7
#> 4 2017-04-01 rstm 8.9 6.6 10.8
#> 5 2017-07-01 KM 3.7 2.8 4.2
#> 6 2017-07-01 rstm 7.2 6.2 8.4
Created on 2021-09-28 by the reprex package (v2.0.1)

Related

Compute multiple column-to-column differences in R

I would like to compute differences among several columns, per identifier (see the script below for a reproducible example and the target data frame).
This question is somewhat similar, but only for pairs of identifiers; I can't see how to adapt it.
I could also have several data frames, one per identifier, but in that case I also don't know how to compute differences across multiple columns.
The code below creates a sample dataset and shows the code I currently use. It gives me what I want; I'd just like to know if there is a way not to spell out every difference I want to compute (my real dataset has more parameters and depths than this sample).
Thanks in advance for your help!
library(tidyverse)
# sample data
create.dt <- function(t = 0) {
  data.frame(
    parameter = rep(c("temperature", "oxygen"), each = 3),
    date = rep(c(Sys.Date() + t), each = 6),
    depth = rep(1:3, times = 2),
    value = c(data.frame(x = rnorm(3, 16, 2)) %>% arrange(-x) %>% pull,
              data.frame(x = rnorm(3, 7, 1)) %>% arrange(-x) %>% pull))
}
# Multi-site dataset
dt <- rbind(
  cbind(site = "A", create.dt(t = c(-3:0))),
  cbind(site = "B", create.dt(t = c(-3:0))),
  cbind(site = "C", create.dt(t = c(-3:0))),
  cbind(site = "D", create.dt(t = c(-3:0))),
  cbind(site = "E", create.dt(t = c(-3:0))))
# Reshape the data and compute differences
dt %>%
  pivot_wider(id_cols = c(site, date), names_from = c(parameter, depth),
              values_from = value, names_sep = "_") %>%
  # do the difference, depth to depth, parameter by parameter
  # What I would like is to not have to write each difference pair by hand
  mutate(temperature_1_2 = temperature_1 - temperature_2,
         temperature_1_3 = temperature_1 - temperature_3,
         temperature_2_3 = temperature_2 - temperature_3,
         oxygen_1_2 = oxygen_1 - oxygen_2,
         oxygen_1_3 = oxygen_1 - oxygen_3,
         oxygen_2_3 = oxygen_2 - oxygen_3)
library(tidyverse)
library(rlang)
create.dt <- function(t = 0) {
  data.frame(
    parameter = rep(c("temperature", "oxygen"), each = 3),
    date = rep(c(Sys.Date() + t), each = 6),
    depth = rep(1:3, times = 2),
    value = c(data.frame(x = rnorm(3, 16, 2)) %>% arrange(-x) %>% pull,
              data.frame(x = rnorm(3, 7, 1)) %>% arrange(-x) %>% pull))
}
# Multi-site dataset
dt <- rbind(
  cbind(site = "A", create.dt(t = c(-3:0))),
  cbind(site = "B", create.dt(t = c(-3:0))),
  cbind(site = "C", create.dt(t = c(-3:0))),
  cbind(site = "D", create.dt(t = c(-3:0))),
  cbind(site = "E", create.dt(t = c(-3:0))))
# result
temperature <- str_c("temperature_", 1:3)
oxygen <- str_c("oxygen_", 1:3)
temperature_frml <- combn(temperature, m = 2, FUN = function(x) str_c(x, collapse = " - "))
oxygen_frml <- combn(oxygen, m = 2, FUN = function(x) str_c(x, collapse = " - "))
all_frml <- c(temperature_frml, oxygen_frml)
df_wider <- dt %>%
  pivot_wider(
    id_cols = c(site, date),
    names_from = c(parameter, depth),
    values_from = value,
    names_sep = "_"
  )
bind_cols(df_wider,
          map_dfc(
            .x = all_frml,
            .f = ~ transmute(.data = df_wider, !!.x := eval(parse_expr(.x)))
          ))
#> # A tibble: 20 x 14
#> site date temperature_1 temperature_2 temperature_3 oxygen_1 oxygen_2
#> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 2021-12-11 17.6 17.1 12.9 7.34 6.86
#> 2 A 2021-12-12 17.6 17.1 12.9 7.34 6.86
#> 3 A 2021-12-13 17.6 17.1 12.9 7.34 6.86
#> 4 A 2021-12-14 17.6 17.1 12.9 7.34 6.86
#> 5 B 2021-12-11 17.1 15.6 13.7 8.52 7.58
#> 6 B 2021-12-12 17.1 15.6 13.7 8.52 7.58
#> 7 B 2021-12-13 17.1 15.6 13.7 8.52 7.58
#> 8 B 2021-12-14 17.1 15.6 13.7 8.52 7.58
#> 9 C 2021-12-11 17.7 15.5 13.6 7.66 7.31
#> 10 C 2021-12-12 17.7 15.5 13.6 7.66 7.31
#> 11 C 2021-12-13 17.7 15.5 13.6 7.66 7.31
#> 12 C 2021-12-14 17.7 15.5 13.6 7.66 7.31
#> 13 D 2021-12-11 16.5 16.4 14.5 7.50 7.27
#> 14 D 2021-12-12 16.5 16.4 14.5 7.50 7.27
#> 15 D 2021-12-13 16.5 16.4 14.5 7.50 7.27
#> 16 D 2021-12-14 16.5 16.4 14.5 7.50 7.27
#> 17 E 2021-12-11 16.7 16.1 15.7 7.52 7.51
#> 18 E 2021-12-12 16.7 16.1 15.7 7.52 7.51
#> 19 E 2021-12-13 16.7 16.1 15.7 7.52 7.51
#> 20 E 2021-12-14 16.7 16.1 15.7 7.52 7.51
#> # ... with 7 more variables: oxygen_3 <dbl>,
#> # temperature_1 - temperature_2 <dbl>, temperature_1 - temperature_3 <dbl>,
#> # temperature_2 - temperature_3 <dbl>, oxygen_1 - oxygen_2 <dbl>,
#> # oxygen_1 - oxygen_3 <dbl>, oxygen_2 - oxygen_3 <dbl>
Created on 2021-12-14 by the reprex package (v2.0.1)
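
If the build-strings-then-parse_expr step feels heavy, here is a minimal base-R sketch with the same result, assuming the df_wider, temperature and oxygen objects defined above (add_diffs is a hypothetical helper, not part of any package):
# append one difference column per pair of columns in `cols`
add_diffs <- function(dat, cols) {
  for (p in combn(cols, 2, simplify = FALSE)) {
    dat[[paste(p, collapse = " - ")]] <- dat[[p[1]]] - dat[[p[2]]]
  }
  dat
}
df_diff <- add_diffs(add_diffs(df_wider, temperature), oxygen)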

Divide all values by reference row

Although this seems similar to that question, I'm looking for a "tidy" solution...
Let's look at the following data (it's rock compositions for some chemical elements, if you are curious):
# A tibble: 4 x 15
Rock La Ce Pr Nd Sm Eu Gd Tb Dy Ho Er Tm Yb Lu
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Upper CC 31 63 7.1 27 4.7 1 4 0.7 3.9 0.83 2.3 0.3 1.96 0.31
2 Middle CC 24 53 5.8 25 4.6 1.4 4 0.7 3.8 0.82 2.3 0.32 2.2 0.4
3 Lower CC 8 20 2.4 11 2.8 1.1 3.1 0.48 3.1 0.68 1.9 0.24 1.5 0.25
4 chondrite 0.235 0.603 0.0891 0.452 0.147 0.056 0.197 0.0363 0.243 0.0556 0.159 0.0242 0.162 0.0243
(see at the end for the dput)
This is made of three samples and a reference value (chondrite). I want to normalize each element's value by the chondrite, for each sample, i.e. get something like this:
# A tibble: 4 x 15
Rock La Ce Pr Nd Sm Eu Gd Tb Dy Ho Er Tm Yb Lu
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Upper CC 132. 104. 79.7 59.7 32.0 17.9 20.3 19.3 16.0 14.9 14.5 12.4 12.1 12.8
2 Middle CC 102. 87.9 65.1 55.3 31.3 25 20.3 19.3 15.6 14.8 14.5 13.2 13.6 16.5
3 Lower CC 34.0 33.2 26.9 24.3 19.0 19.6 15.7 13.2 12.8 12.2 12.0 9.92 9.26 10.3
4 chondrite 1 1 1 1 1 1 1 1 1 1 1 1 1 1
Here, of course, the first value 132 for df["Upper CC","La"] comes from 31 / 0.235, i.e. df["Upper CC","La"] / df["chondrite","La"].
This is trivial in Excel, and can be done in plain R with something along the lines of
apply(df[, -1], 1, FUN = function(z) z / df[4, -1])
give or take some unlist() and other niceties.
But how do I do this in tidyverse idiom? I started constructing
df %>% mutate(across(where(is.numeric), ... ? ...))
... but could not go further.
Generalized/related question: instead of normalizing by df[4, ], normalize by an arbitrary named vector.
dput(df)
structure(list(Rock = c("Upper CC", "Middle CC", "Lower CC",
"chondrite"), La = c(31, 24, 8, 0.2347), Ce = c(63, 53, 20, 0.6032
), Pr = c(7.1, 5.8, 2.4, 0.0891), Nd = c(27, 25, 11, 0.4524),
Sm = c(4.7, 4.6, 2.8, 0.1471), Eu = c(1, 1.4, 1.1, 0.056),
Gd = c(4, 4, 3.1, 0.1966), Tb = c(0.7, 0.7, 0.48, 0.0363),
Dy = c(3.9, 3.8, 3.1, 0.2427), Ho = c(0.83, 0.82, 0.68, 0.0556
), Er = c(2.3, 2.3, 1.9, 0.1589), Tm = c(0.3, 0.32, 0.24,
0.0242), Yb = c(1.96, 2.2, 1.5, 0.1625), Lu = c(0.31, 0.4,
0.25, 0.0243)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
You can use:
library(dplyr)
df %>% mutate(across(where(is.numeric), ~./.[Rock == "chondrite"]))
# Rock La Ce Pr Nd Sm Eu Gd Tb Dy
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Upper … 132. 104. 79.7 59.7 32.0 17.9 20.3 19.3 16.1
#2 Middle… 102. 87.9 65.1 55.3 31.3 25.0 20.3 19.3 15.7
#3 Lower … 34.1 33.2 26.9 24.3 19.0 19.6 15.8 13.2 12.8
#4 chondr… 1 1 1 1 1 1 1 1 1
# … with 5 more variables: Ho <dbl>, Er <dbl>, Tm <dbl>,
# Yb <dbl>, Lu <dbl>
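For the generalized question (normalizing by an arbitrary named vector instead of a reference row), a sketch assuming dplyr >= 1.0, where cur_column() looks up the current column's entry in the vector (ref is built from the chondrite row here only for illustration; any named numeric vector works):
ref <- unlist(df[df$Rock == "chondrite", -1])  # named numeric vector
df %>% mutate(across(all_of(names(ref)), ~ .x / ref[[cur_column()]]))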
Using matrix calculations.
m <- t(dat[-1])           # transpose: each rock becomes a column
dat[-1] <- t(m / m[, 4])  # divide every column by column 4 (chondrite), transpose back
# Rock La Ce Pr Nd Sm Eu Gd Tb Dy Ho Er Tm Yb Lu
# 1 Upper CC 131.91489 104.47761 79.68575 59.73451 31.97279 17.85714 20.30457 19.28375 16.04938 14.92806 14.46541 12.396694 12.098765 12.75720
# 2 Middle CC 102.12766 87.89386 65.09540 55.30973 31.29252 25.00000 20.30457 19.28375 15.63786 14.74820 14.46541 13.223140 13.580247 16.46091
# 3 Lower CC 34.04255 33.16750 26.93603 24.33628 19.04762 19.64286 15.73604 13.22314 12.75720 12.23022 11.94969 9.917355 9.259259 10.28807
# 4 chondrite 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.000000 1.000000 1.00000
Data
dat <- structure(list(Rock = c("Upper CC", "Middle CC", "Lower CC",
"chondrite"), La = c(31, 24, 8, 0.235), Ce = c(63, 53, 20, 0.603
), Pr = c(7.1, 5.8, 2.4, 0.0891), Nd = c(27, 25, 11, 0.452),
Sm = c(4.7, 4.6, 2.8, 0.147), Eu = c(1, 1.4, 1.1, 0.056),
Gd = c(4, 4, 3.1, 0.197), Tb = c(0.7, 0.7, 0.48, 0.0363),
Dy = c(3.9, 3.8, 3.1, 0.243), Ho = c(0.83, 0.82, 0.68, 0.0556
), Er = c(2.3, 2.3, 1.9, 0.159), Tm = c(0.3, 0.32, 0.24,
0.0242), Yb = c(1.96, 2.2, 1.5, 0.162), Lu = c(0.31, 0.4,
0.25, 0.0243)), class = "data.frame", row.names = c("1",
"2", "3", "4"))
Using data.table
library(data.table)
setDT(df1)[, (names(df1)[-1]) := lapply(.SD, function(x)
  x / x[match("chondrite", Rock)]), .SDcols = -1]
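
A hedged base-R alternative with the same result, using sweep() to divide every numeric column by the chondrite row (assuming dat as defined in the Data block above):
dat[-1] <- sweep(dat[-1], 2, unlist(dat[dat$Rock == "chondrite", -1]), `/`)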

Error when running cor.test on multiple columns with apply in R

I want to calculate the correlation between the 'y' column and each column of the 'col_df' data frame.
For each test I want to keep only the column names with a significant p-value (p_value < 0.05).
y is a 64x1 vector of 0s and 1s.
Example of col_df (60x12000):
    a   b   c    d   e
  7.6 4.9 8.9  6.0 4.2
 25.0 6.5 4.6 13.2 3.0
col_df <- as.matrix(df)
test <- col_df[, apply(col_df, MARGIN = 2, FUN = function(x)
  (cor.test(y, col_df[, x], method = "pearson")$p.value < 0.05))]
This is the error:
Error in col_df[, x] : subscript out of bounds
Is this the way to do that?
This is a working solution:
df <- structure(list(a = c(7.6, 7.6, 25, 25, 25, 25, 7.6, 7.6, 7.6, 25),
                     b = c(4.9, 4.9, 6.5, 6.5, 4.9, 6.5, 4.9, 4.9, 6.5, 6.5),
                     c = c(8.9, 4.6, 8.9, 8.9, 8.9, 4.6, 4.6, 8.9, 8.9, 4.6),
                     d = c(13.2, 13.2, 6, 6, 6, 6, 6, 13.2, 13.2, 13.2),
                     e = c(3, 4.2, 3, 4.2, 3, 3, 3, 4.2, 4.2, 4.2)),
                class = "data.frame", row.names = c(NA, -10L))
y <- c(1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L)
test <- df[, apply(df, MARGIN = 2, FUN = function(x)
  (cor.test(y, x, method = "pearson")$p.value < 0.05))]
test
test
#> a b
#> 1 7.6 4.9
#> 2 7.6 4.9
#> 3 25.0 6.5
#> 4 25.0 6.5
#> 5 25.0 4.9
#> 6 25.0 6.5
#> 7 7.6 4.9
#> 8 7.6 4.9
#> 9 7.6 6.5
#> 10 25.0 6.5
The difference to your solution is that apply() passes the column itself as x,
not an index. Hence, all you have to do is replace col_df[,x] in your solution with
just x.
You can simplify it a little with sapply(). I also recommend not putting everything on
a single line; it is hard to read and harder to debug.
Columns <- sapply(df, FUN = function(x) (cor.test(y, x, method = "pearson")$p.value < 0.05))
test <- df[, Columns]
test
#> a b
#> 1 7.6 4.9
#> 2 7.6 4.9
#> 3 25.0 6.5
#> 4 25.0 6.5
#> 5 25.0 4.9
#> 6 25.0 6.5
#> 7 7.6 4.9
#> 8 7.6 4.9
#> 9 7.6 6.5
#> 10 25.0 6.5
Created on 2020-07-22 by the reprex package (v0.3.0)
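
A small hedged extension, in case you also want to keep the p-values alongside the significant column names (pvals is a hypothetical name):
pvals <- sapply(df, function(x) cor.test(y, x, method = "pearson")$p.value)
names(pvals)[pvals < 0.05]  # the significant column names
df[, pvals < 0.05]          # same subset as `test` above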

split.default Without Sorting by Column Names

When I run split.default() on the following data in R, the separated sub-data.frames in the list get sorted by column name.
I don't want this to happen; I want to preserve the original column-name sequence. Is there an approach I can follow to do so? Please suggest.
Input Data
data <- structure(list(`B-DIODE` = c(1.2, 0.4), `B-DIODE` = c(1.3, 0.6
), `A-DIODE` = c(1.4, 0.8), `A-ACC1` = c(1.5, 1), `A-ACC2` = c(1.6,
1.2), `A-ANA0` = c(1.7, 1.4), `A-ANA1` = c(1.8, 1.6), `A-BRICKID` = c(1.9,
1.8), `A-CC0` = c(2L, 2L), `A-CC1` = c(2.1, 2.2), `A-DIGDN` = c(2.2,
2.4), `A-DIGDP` = c(2.3, 2.6), `A-DN1` = c(2.4, 2.8), `A-DN2` = c(2.5,
3), `A-DP1` = c(2.6, 3.2), `A-DP2` = c(2.7, 3.4), `A-SCL` = c(2.8,
3.6), `A-SDA` = c(2.9, 3.8), `A-USB0DN` = 3:4, `A-USB0DP` = c(3.1,
4.2), `A-USB1DN` = c(3.2, 4.4), `A-USB1DP` = c(3.3, 4.6), `A-ACC1` = c(3.4,
4.8), `A-ACC2` = c(3.5, 5), `A-ANA0` = c(3.6, 5.2), `A-ANA1` = c(3.7,
5.4), `A-BRICKID` = c(3.8, 5.6), `A-CC0` = c(3.9, 5.8), `A-CC1` = c(4L,
6L), `A-DIGDN` = c(4.1, 6.2), `A-DIGDP` = c(4.2, 6.4), `A-DN1` = c(4.3,
6.6), `A-DN2` = c(4.4, 6.8), `A-DP1` = c(4.5, 7), `A-DP2` = c(4.6,
7.2), `A-SCL` = c(4.7, 7.4), `A-SDA` = c(4.8, 7.6), `A-USB0DN` = c(4.9,
7.8), `A-USB0DP` = c(5L, 8L), `A-USB1DN` = c(5.1, 8.2), `A-USB1DP` = c(5.2,
8.4), `A-NA` = c(5.3, 8.6), `A-ACC2PWRLKG_0v4` = c(5.4, 8.8),
`A-ACC2PWRLKG_0v4` = c(5.5, 9), `A-P_IN_Leak` = c(5.6, 9.2
)), class = "data.frame", row.names = c(NA, -2L))
Code
split.default(data, sub("-.*", "", names(data)))
Output
$`A`
A-DIODE A-ACC1 A-ACC2 A-ANA0 A-ANA1 A-BRICKID A-CC0 A-CC1 A-DIGDN A-DIGDP A-DN1 A-DN2 A-DP1 A-DP2 A-SCL A-SDA A-USB0DN A-USB0DP A-USB1DN A-USB1DP
1 1.4 1.5 1.6 1.7 1.8 1.9 2 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3 3.1 3.2 3.3
2 0.8 1.0 1.2 1.4 1.6 1.8 2 2.2 2.4 2.6 2.8 3.0 3.2 3.4 3.6 3.8 4 4.2 4.4 4.6
A-ACC1.1 A-ACC2.1 A-ANA0.1 A-ANA1.1 A-BRICKID.1 A-CC0.1 A-CC1.1 A-DIGDN.1 A-DIGDP.1 A-DN1.1 A-DN2.1 A-DP1.1 A-DP2.1 A-SCL.1 A-SDA.1 A-USB0DN.1
1 3.4 3.5 3.6 3.7 3.8 3.9 4 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9
2 4.8 5.0 5.2 5.4 5.6 5.8 6 6.2 6.4 6.6 6.8 7.0 7.2 7.4 7.6 7.8
A-USB0DP.1 A-USB1DN.1 A-USB1DP.1 A-NA A-ACC2PWRLKG_0v4 A-ACC2PWRLKG_0v4.1 A-P_IN_Leak
1 5 5.1 5.2 5.3 5.4 5.5 5.6
2 8 8.2 8.4 8.6 8.8 9.0 9.2
$B
B-DIODE B-DIODE.1
1 1.2 1.3
2 0.4 0.6
In the above output I want $B to appear first and then $A, since that is the sequence the input data followed.
One option is to convert the names to factor and set the levels as needed
new_name <- sub("-.*", "", names(data))
split.default(data, factor(new_name, levels = unique(new_name)))
#$B
# B-DIODE B-DIODE.1
#1 1.2 1.3
#2 0.4 0.6
#$A
# A-DIODE A-ACC1 A-ACC2 A-ANA0 ....
#1 1.4 1.5 1.6 1.7 ....
#2 0.8 1.0 1.2 1.4 ....
By specifying levels as unique(new_name) we ensure that the list is split based on occurrence in the data frame and not alphabetically.
As @thelatemail suggests, we can also avoid converting the names to a factor by reordering the result with unique(new_name):
split.default(data, new_name)[unique(new_name)]
Another option is to create the group indices for splitting using rle():
rl <- rle(sub("-.*", "", names(data)))
split.default(data, rep(seq_along(rl$values), rl$lengths))  # one index per run
#$`1`
# B-DIODE B-DIODE.1
#1 1.2 1.3
#2 0.4 0.6
#
#$`2`
# A-DIODE A-ACC1 A-ACC2 A-ANA0 A-ANA1 A-BRICKID A-CC0 A-CC1 A-DIGDN A-DIGDP
#1 1.4 1.5 1.6 1.7 1.8 1.9 2 2.1 2.2 2.3
#2 0.8 1.0 1.2 1.4 1.6 1.8 2 2.2 2.4 2.6
# A-DN1 A-DN2 A-DP1 A-DP2 A-SCL A-SDA A-USB0DN A-USB0DP A-USB1DN A-USB1DP
#1 2.4 2.5 2.6 2.7 2.8 2.9 3 3.1 3.2 3.3
#2 2.8 3.0 3.2 3.4 3.6 3.8 4 4.2 4.4 4.6
# A-ACC1.1 A-ACC2.1 A-ANA0.1 A-ANA1.1 A-BRICKID.1 A-CC0.1 A-CC1.1 A-DIGDN.1
#1 3.4 3.5 3.6 3.7 3.8 3.9 4 4.1
#2 4.8 5.0 5.2 5.4 5.6 5.8 6 6.2
# A-DIGDP.1 A-DN1.1 A-DN2.1 A-DP1.1 A-DP2.1 A-SCL.1 A-SDA.1 A-USB0DN.1
#1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9
#2 6.4 6.6 6.8 7.0 7.2 7.4 7.6 7.8
# A-USB0DP.1 A-USB1DN.1 A-USB1DP.1 A-NA A-ACC2PWRLKG_0v4 A-ACC2PWRLKG_0v4.1
#1 5 5.1 5.2 5.3 5.4 5.5
#2 8 8.2 8.4 8.6 8.8 9.0
# A-P_IN_Leak
#1 5.6
#2 9.2
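A hedged follow-up to the rle() approach, in case you prefer the prefixes as list names rather than the run indices "1" and "2" (out is a hypothetical name; this assumes each prefix forms a single contiguous run):
rl <- rle(sub("-.*", "", names(data)))
out <- split.default(data, rep(seq_along(rl$values), rl$lengths))
names(out) <- rl$values  # "B", "A"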

Add a date column to a data frame after aggregating from weekly to monthly

I have the following weekly data frame:
df <- data.frame(
  Date = c("2017-08-01", "2017-08-08", "2017-08-15", "2017-08-22", "2017-08-29", "2017-09-05"),
  item1 = c(1.6, 1.8, 1.6, 2.0, 1.4, 1.5),
  item2 = c(38.6, 35.1, 42.6, 43.1, 42, 41),
  item3 = c(16.9, 17.6, 18.5, 19.8, 17, 18))
> df
Date item1 item2 item3
1 2017-08-01 1.6 38.6 16.9
2 2017-08-08 1.8 35.1 17.6
3 2017-08-15 1.6 42.6 18.5
4 2017-08-22 2.0 43.1 19.8
5 2017-08-29 1.4 42.0 17.0
6 2017-09-05 1.5 41.0 18.0
Then I convert my df to a monthly data frame using zoo's aggregate() function:
df_monthly <- round(aggregate(zoo(df[,-1], as.Date(df$Date)), by = month, FUN = sum), 0)
> df_monthly
item1 item2 item3
2017.08 8 201 90
2017.09 2 41 18
Now I need to add a date column to df_monthly that shows the month and the year, before I write df_monthly to a csv file. I tried a few different methods, but they didn't work.
df_monthly$Date<-data.frame(seq(as.Date("2017/08/01"), as.Date("2017/09/05"), "months"))
> df_monthly
item1 item2 item3 Date
1 8 201 90 Numeric,2
6 2 41 18 Numeric,2
Any help will be highly appreciated.
library(dplyr)
library(lubridate)
df <- data.frame(
  Date = c("2017-08-01", "2017-08-08", "2017-08-15", "2017-08-22", "2017-08-29", "2017-09-05"),
  item1 = c(1.6, 1.8, 1.6, 2.0, 1.4, 1.5),
  item2 = c(38.6, 35.1, 42.6, 43.1, 42, 41),
  item3 = c(16.9, 17.6, 18.5, 19.8, 17, 18))
df %>%
  mutate(year = year(Date),          # get the year as a variable
         month = month(Date)) %>%    # get the month as a variable
  group_by(year, month) %>%          # group by year and month
  summarise(item1 = sum(item1),      # get the sums for each item
            item2 = sum(item2),
            item3 = sum(item3)) %>%
  ungroup()                          # forget the grouping
# # A tibble: 2 x 5
# year month item1 item2 item3
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2017 8 8.4 201.4 89.8
# 2 2017 9 1.5 41.0 18.0
In case you have more than three item columns and want a more general solution, you can use this:
library(dplyr)
library(lubridate)
library(tidyr)
df <- data.frame(
  Date = c("2017-08-01", "2017-08-08", "2017-08-15", "2017-08-22", "2017-08-29", "2017-09-05"),
  item1 = c(1.6, 1.8, 1.6, 2.0, 1.4, 1.5),
  item2 = c(38.6, 35.1, 42.6, 43.1, 42, 41),
  item3 = c(16.9, 17.6, 18.5, 19.8, 17, 18))
df %>%
  gather(item, value, -Date) %>%     # reshape dataset to long form
  mutate(year = year(Date),          # get the year as a variable
         month = month(Date)) %>%    # get the month as a variable
  group_by(year, month, item) %>%    # group by year, month and item
  summarise(value = sum(value)) %>%  # get the sum of the values
  spread(item, value) %>%            # reshape dataset back to wide form
  ungroup()                          # forget the grouping
# # A tibble: 2 x 5
# year month item1 item2 item3
# * <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2017 8 8.4 201.4 89.8
# 2 2017 9 1.5 41.0 18.0
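And a hedged variant in case a single Date column (first day of each month) is preferred over separate year/month columns, assuming lubridate's floor_date() and dplyr >= 1.0:
library(dplyr)
library(lubridate)
df %>%
  mutate(Date = floor_date(as.Date(Date), unit = "month")) %>%  # first day of the month
  group_by(Date) %>%
  summarise(across(starts_with("item"), sum))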
