insert a column into a tabular object - r

I found this blog on getting a nested table into LaTeX format (Blog Link). I like the outcome, but I want to insert a column into the object at the beginning, after the row names. I'm used to dealing with data frames, so dealing with this beast is more difficult than typical column indexing.
Here's what I have now:
pre post
approach mean sd mean sd
1 24.17 8.310 54.33 11.01
2 25.50 9.434 65.25 16.32
3 26.33 9.139 63.17 12.53
And here's what I'd like it to look like:
pre post
approach n mean sd mean sd
1 12 24.17 8.310 54.33 11.01
2 12 25.50 9.434 65.25 16.32
3 12 26.33 9.139 63.17 12.53
Here's the dput of z and also the column of n's I'd like to insert.
Thank you in advance.
z <- structure(list(24.1666666666667, 25.5, 26.3333333333333, 8.31027111835746,
9.4339811320566, 9.13866245766587, 54.3333333333333, 65.25,
63.1666666666667, 11.0068848977136, 16.3157759685081, 12.5323822978956), .Dim = 3:4, .Dimnames = list(
NULL, c("term", "term", "term", "term")), rowLabels = structure(c("1",
"2", "3"), .Dim = c(3L, 1L), .Dimnames = list(NULL, "approach"), justification = structure(c(NA_character_,
NA_character_, NA_character_), .Dim = c(3L, 1L)), colnamejust = NA_character_, justify = NA, suppress = 0), colLabels = structure(c("pre",
"mean", NA, "sd", "post", "mean", NA, "sd"), .Dim = c(2L, 4L), justification = structure(c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_), .Dim = c(2L, 4L)), colnamejust = character(0), justify = NA, suppress = 0), table = value *
v * approach ~ variable2 * result_variable, formats = structure(c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Dim = 3:4, .Dimnames = list(
NULL, c("format", "format", "format", "format"))), justification = structure(c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Dim = 3:4, .Dimnames = list(
NULL, c("justification", "justification", "justification",
"justification"))), class = "tabular")
structure(c(12L, 12L, 12L), .Names = c("1", "2", "3"))

The only way I know of is to add the new column and then re-assign the columns in the desired order:
R> mockup <- data.frame(B=21:23, C=31:33)
R> mockup
B C
1 21 31
2 22 32
3 23 33
R>
Now add column A:
R> mockup[,"A"] <- 1:3
R> mockup
B C A
1 21 31 1
2 22 32 2
3 23 33 3
R>
And reorder:
R> mockup <- mockup[,c("A", "B", "C")]
R> mockup
A B C
1 1 21 31
2 2 22 32
3 3 23 33
R>
Presto. New column at the beginning.

Something like this:
z <- data.frame(approach = gl(3, 12), pre = rnorm(36)*50, post = rnorm(36)*60)
library(tables)
tabular(approach ~ (pre + post) * (mean + sd))
pre post
approach mean sd mean sd
1 -5.431 61.01 3.766 54.76
2 20.408 29.14 -9.261 54.58
3 -7.854 53.55 -30.046 62.41
tabular(approach ~ (n=1) + (pre + post) * (mean + sd))
pre post
approach n mean sd mean sd
1 12 -5.431 61.01 3.766 54.76
2 12 20.408 29.14 -9.261 54.58
3 12 -7.854 53.55 -30.046 62.41
tabular(approach + 1 ~ (n=1) + (pre + post) * (mean + sd))
pre post
approach n mean sd mean sd
1 12 -5.431 61.01 3.766 54.76
2 12 20.408 29.14 -9.261 54.58
3 12 -7.854 53.55 -30.046 62.41
All 36 2.374 50.06 -11.847 57.46
For more details see vignette of the tables package.

Related

Simple but not easy merge task

I have two incomplete dataframes (df_a, df_b): Columns are missing or NA values. "by" is the merge index and df_a has "priority" over df_b.
df_a = structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400, 1635174000), class = c("POSIXct", "POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513), Export = c("10.912", "10.47", NA, NA), color = c("rgb(0,128,0)", "rgb(0,128,0)", NA, NA), Status = c("ok", "ok", NA, NA), Plausibilität = c("4", "4", NA, NA), min = c(7.93000000000001, 9.4, 8.7, 8.3), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625)), row.names = c(NA, -4L), class = "data.frame")
df_b = structure(list(Datum = structure(c(1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct", "POSIXt")), Vorhersage = c(14.821988, 14.832919, 14.706179, 14.573527), Referenz = c(16.6, 16.2, 15.9, 16), DWD_Name = c("Elpersbüttel", "Elpersbüttel", "Elpersbüttel", "Elpersbüttel"), Export = c(17.198, 16.713, 16.378, 16.358), color = c("rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)"), Status = c("ok", "ok", "ok", "ok"), Plausibilität = c(4, 4, 4, 4), min = c(13.05, 12.808, 11.631891, 12.312), max = c(17, 17, 16.9, 16.7)), row.names = c(NA, -4L), class = "data.frame")
desired output is:
Datum Vorhersage Export color Status Plausibilität min max Referenz
1 2021-10-25 14:00:00 10.3 10.912 rgb(0,128,0) ok 4 7.9 12 NA
2 2021-10-25 15:00:00 10.2 10.47 rgb(0,128,0) ok 4 9.4 12 NA
3 2021-10-25 16:00:00 10.0 <NA> <NA> <NA> <NA> 8.7 13 NA
4 2021-10-25 17:00:00 9.7 <NA> <NA> <NA> <NA> 8.3 12 NA
5 2021-09-24 21:00:00 14.8 17.198 rgb(0,128,0) ok 4 13.1 17 17
6 2021-09-24 22:00:00 14.8 16.713 rgb(0,128,0) ok 4 12.8 17 16
7 2021-09-24 23:00:00 14.7 16.378 rgb(0,128,0) ok 4 11.6 17 16
8 2021-09-25 00:00:00 14.6 16.358 rgb(0,128,0) ok 4 12.3 17 16
DWD_Name
1 <NA>
2 <NA>
3 <NA>
4 <NA>
5 Elpersbüttel
6 Elpersbüttel
7 Elpersbüttel
8 Elpersbüttel
# for rebuild:
structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400,
1635174000, 1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct",
"POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513,
14.821988, 14.832919, 14.706179, 14.573527), Export = c("10.912",
"10.47", NA, NA, "17.198", "16.713", "16.378", "16.358"), color = c("rgb(0,128,0)",
"rgb(0,128,0)", NA, NA, "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)",
"rgb(0,128,0)"), Status = c("ok", "ok", NA, NA, "ok", "ok", "ok",
"ok"), Plausibilität = c("4", "4", NA, NA, "4", "4", "4", "4"
), min = c(7.93000000000001, 9.4, 8.7, 8.3, 13.05, 12.808, 11.631891,
12.312), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625,
17, 17, 16.9, 16.7), Referenz = c(NA, NA, NA, NA, 16.6, 16.2,
15.9, 16), DWD_Name = c(NA, NA, NA, NA, "Elpersbüttel", "Elpersbüttel",
"Elpersbüttel", "Elpersbüttel")), row.names = c(NA, -8L), class = "data.frame")
Thanks to the help of @r2evans, I tried the following:
by = "Datum"
library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), by)
setDT(df_a)
setDT(df_b)
merge(df_a, df_b, by = by, all = TRUE
)[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]
but I get the following error:
Error in fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]) :
Item 2 is type double but the first item is type character. Please coerce
Most of the other answers are good, but many either over-complicate the result (in my opinion) or they perform a left or right join, not the full join as expected in the OP.
Here's a quick solution that uses dynamic column names.
library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), "by")
colnms
# [1] "a"
setDT(df_a)
setDT(df_b)
merge(df_a, df_b, by = "by", all = TRUE
)[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]
# by b c a
# <num> <num> <num> <num>
# 1: 1 1 NA 1
# 2: 2 NA 2 2
# 3: 3 3 3 3
# 4: 4 NA 4 4
Notes:
the normal data.table::[ merge is a left-join only, so we need to use data.table::merge in order to be able to get a full-join with all=TRUE;
because it's using merge, the repeated columns get the .x and .y suffixes, something we can easily capitalize on;
the canonical and most-performant way when using (colnms) := ... is to also include .SDcols=colnms, but that won't work as well here since we need the suffixed columns, not the colnms columns themselves; this is a slight performance penalty but certainly not an anti-pattern (I believe) given what we need to do; and since we could have more than one duplicate column, we have to be careful to do it with each pair at a time, not all of them at once;
the last [-block (using outer) is for removing the duplicate columns; without it, the output would have column names c("by", "a.x", "b", "a.y", "c", "a"). It uses outer because that's a straight-forward way to get 1-or-more colnms and combine .x and .y to each of them; it then uses data.table's := NULL shortcut for removing one-or-more columns.
This isn't the most elegant, but you can make a function that applies your rule to coalesce the values if they occur in both data frames.
# find the unique column names (not called "by")
cols <- union(names(df_a),names(df_b))
cols <- cols[!(cols == "by")]
# merge the data sets
df_merge <- merge(df_a, df_b, by = "by", all = TRUE)
# function to check for the base column names that now have a '.x' and
# a '.y' version. for the columns, fill in the NAs from '.x' with the
# value from '.y'
# Coalesce the ".x"/".y" suffixed copies of a merged column.
#
# After merge(), a column present in both inputs shows up as "<name>.x" and
# "<name>.y". When both suffixed columns exist, fill the NAs in the ".x"
# copy from the ".y" copy and return the filled vector; otherwise the column
# was unique to one input, so return it unchanged.
col_val <- function(col_base, df) {
  suffixed <- paste0(col_base, c(".x", ".y"))
  if (!all(suffixed %in% names(df))) {
    # Column only occurred in one of the merged frames - nothing to coalesce.
    return(df[[col_base]])
  }
  primary <- df[[suffixed[1]]]
  fallback <- df[[suffixed[2]]]
  missing_idx <- is.na(primary)
  primary[missing_idx] <- fallback[missing_idx]
  primary
}
# apply this function to every column
cbind(df_merge["by"], sapply(cols, col_val, df = df_merge))
This will give the following result.
by a b c
1 1 1 1 NA
2 2 2 NA 2
3 3 3 3 3
4 4 4 NA 4
I know you specified base R, but the natural_join() function is worth mentioning.
library(rqdatatable)
natural_join(df_a, df_b, by = "by", jointype = "FULL")
This gives exactly what you want.
by a b c
1 1 1 1 NA
2 2 2 NA 2
3 3 3 3 3
4 4 4 NA 4
Not the answer with R base. But one possible solution with the package data.table
library(data.table)
setDT(df_a)
setDT(df_b)
df_a <- rbind(df_a, list(4, NA, NA))
df_b <- rbind(list(1, NA, NA), df_b)
df_a[df_b, `:=` (a = fifelse(is.na(a), i.a, a), c = c), on = .(by)][]
#> by a b c
#> 1: 1 1 1 NA
#> 2: 2 2 NA 2
#> 3: 3 3 3 3
#> 4: 4 4 NA 4
Edit, with the help of @r2evans: a much more elegant and efficient solution:
df_a[df_b, `:=` (a = fcoalesce(a, i.a), c = c), on = .(by)][]
#> by a b c
#> 1: 1 1 1 NA
#> 2: 2 2 NA 2
#> 3: 3 3 3 3
#> 4: 4 4 NA 4
Created on 2021-10-19 by the reprex package (v2.0.1)
Here is a dynamic solution — not bad, but maybe someone knows how to speed it up.
# Full-join two data frames on the key column `by`, giving df_a priority:
# NAs in df_a's shared columns are first supplemented from df_b, then a
# complete frame over the union of keys and columns is assembled, writing
# df_a's rows last so they win wherever both frames have a row.
#
# Assumes the key column is the FIRST column of both frames (the body
# indexes df_a[, 1] / df_b[, 1]) -- TODO confirm with callers.
#
# @param df_a data frame whose values take priority.
# @param df_b data frame merged underneath df_a.
# @param by   name of the key column present in both frames.
# @return data frame with one row per unique key and the union of columns.
get_complete_df <- function(df_a, df_b, by = "by") {
  df_a <- unique(df_a)
  df_b <- unique(df_b)
  nam_a <- names(df_a)[!(names(df_a) == by)]
  nam_b <- names(df_b)[!(names(df_b) == by)]
  nums_a <- unlist(lapply(df_a, is.numeric))
  nums_b <- unlist(lapply(df_b, is.numeric))
  # BUG FIX: unique()'s second argument is `incomparables`, not more input,
  # so the original silently dropped df_b-only numeric columns from this
  # list (hence "why did some(!) num fields changed to chr ?" below).
  # Concatenate both name vectors before deduplicating.
  nums <- unique(c(names(df_a)[nums_a], names(df_b)[nums_b]))
  ## try to supplement NAs in df_a's shared columns with df_b's values
  x <- df_b[[by]][df_b[[by]] %in% df_a[[by]]]
  y <- nam_b[nam_b %in% nam_a]
  vna <- is.na(df_a[df_a[, 1] %in% x, y])
  df_a[df_a[, 1] %in% x, y][vna] <- df_b[df_b[, 1] %in% x, y][vna]
  ## union of column names (key first) and of key values
  all_names <- c(by, unique(c(nam_a, nam_b)))
  all_by <- na.omit(unique(c(df_a[[by]], df_b[[by]])))
  ## build an empty shell with one row per key
  df_o <- as.data.frame(matrix(nrow = length(all_by), ncol = length(all_names)))
  names(df_o) <- all_names
  df_o[[by]] <- all_by
  ## fill in content; df_a is written last, so df_a has priority!
  df_o[df_o[, 1] %in% df_b[, 1], names(df_b)] <- df_b
  df_o[df_o[, 1] %in% df_a[, 1], names(df_a)] <- df_a
  # fix numeric: the block assignments above can coerce numeric fields to
  # character when types get mixed; coerce the known-numeric columns back.
  # drop = FALSE keeps apply() working when only one numeric column exists.
  df_o[, nums] <- as.data.frame(apply(df_o[, nums, drop = FALSE], 2, as.numeric))
  df_o
}

Determine range of time where measurements are not NA

I have a dataset with hundreds of thousands of measurements taken from several subjects. However, the measurements are only partially available, i.e., there may be large stretches with NA. I need to establish up front, for which timespan positive data are available for each subject.
Data:
df
timestamp C B A starttime_ms
1 00:00:00.033 NA NA NA 33
2 00:00:00.064 NA NA NA 64
3 00:00:00.066 NA 0.346 NA 66
4 00:00:00.080 47.876 0.346 22.231 80
5 00:00:00.097 47.876 0.346 22.231 97
6 00:00:00.099 47.876 0.346 NA 99
7 00:00:00.114 47.876 0.346 NA 114
8 00:00:00.130 47.876 0.346 NA 130
9 00:00:00.133 NA 0.346 NA 133
10 00:00:00.147 NA 0.346 NA 147
My (humble) solution so far is (i) to pick out the range of timestamp values that are not NA and to select the first and last such timestamp for each subject individually. Here's the code for subject C:
NotNA_C <- df$timestamp[which(!is.na(df$C))]
range_C <- paste(NotNA_C[1], NotNA_C[length(NotNA_C)], sep = " - ")
range_C
[1] "00:00:00.080" "00:00:00.130"
That doesn't look elegant and, what's more, it needs to be repeated for all other subjects. Is there a more efficient way to establish the range of time for which non-NA values are available for all subjects in one go?
EDIT
I've found a base R solution:
sapply(df[,2:4], function(x)
paste(df$timestamp[which(!is.na(x))][1],
df$timestamp[which(!is.na(x))][length(df$timestamp[which(!is.na(x))])], sep = " - "))
C B A
"00:00:00.080 - 00:00:00.130" "00:00:00.066 - 00:00:00.147" "00:00:00.080 - 00:00:00.097"
but would be interested in other solutions as well!
Reproducible data:
df <- structure(list(timestamp = c("00:00:00.033", "00:00:00.064",
"00:00:00.066", "00:00:00.080", "00:00:00.097", "00:00:00.099",
"00:00:00.114", "00:00:00.130", "00:00:00.133", "00:00:00.147"
), C = c(NA, NA, NA, 47.876, 47.876, 47.876, 47.876, 47.876,
NA, NA), B = c(NA, NA, 0.346, 0.346, 0.346, 0.346,
0.346, 0.346, 0.346, 0.346), A = c(NA, NA, NA, 22.231, 22.231, NA, NA, NA, NA,
NA), starttime_ms = c(33, 64, 66, 80, 97, 99, 114, 130, 133,
147)), row.names = c(NA, 10L), class = "data.frame")
dplyr solution
library(tidyverse)
df <- structure(list(timestamp = c("00:00:00.033", "00:00:00.064",
"00:00:00.066", "00:00:00.080", "00:00:00.097", "00:00:00.099",
"00:00:00.114", "00:00:00.130", "00:00:00.133", "00:00:00.147"
), C = c(NA, NA, NA, 47.876, 47.876, 47.876, 47.876, 47.876,
NA, NA), B = c(NA, NA, 0.346, 0.346, 0.346, 0.346,
0.346, 0.346, 0.346, 0.346), A = c(NA, NA, NA, 22.231, 22.231, NA, NA, NA, NA,
NA), starttime_ms = c(33, 64, 66, 80, 97, 99, 114, 130, 133,
147)), row.names = c(NA, 10L), class = "data.frame")
df %>%
pivot_longer(-c(timestamp, starttime_ms)) %>%
group_by(name) %>%
drop_na() %>%
summarise(min = timestamp %>% min(),
max = timestamp %>% max())
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 3 x 3
#> name min max
#> <chr> <chr> <chr>
#> 1 A 00:00:00.080 00:00:00.097
#> 2 B 00:00:00.066 00:00:00.147
#> 3 C 00:00:00.080 00:00:00.130
Created on 2021-02-15 by the reprex package (v0.3.0)
You could look at the cumsum of differences where there's no NA, coerce them to logical and subset first and last element.
lapply(data.frame(apply(rbind(0, diff(!sapply(df[c("C", "B", "A")], is.na))), 2, cumsum)),
function(x) c(df$timestamp[as.logical(x)][1], rev(df$timestamp[as.logical(x)])[1]))
# $C
# [1] "00:00:00.080" "00:00:00.130"
#
# $B
# [1] "00:00:00.066" "00:00:00.147"
#
# $A
# [1] "00:00:00.080" "00:00:00.097"

Pivot dataset and column names [R]

I have a dataset that I want to pivot.
dataset <- data.frame(date = c("01/01/2020","02/01/2020", "02/01/2020", "03/01/2020")
, camp_type = c("acquisition", "acquisition", "newsletter", "acquisition")
, channel_type = c("email", "direct_mail","email","email")
, sent = c(100, 200, 50, 250)
, open = c(30, NA, 14, 148)
, click = c(14, NA, 1, 100)
)
PLEASE NOTE: I have many more camp_types than the ones displayed in this example.
I want to get one row per day, and the rest of the information in different columns such as the picture below (renaming the columns "sent", "open" and "click" based on "channel_type" and "camp_type").
I have tried something not very elegant, and entirely manual, but I get an error when I rename the variables (code below)
dataset %>%
filter(camp_type == 'Acquisition' & channel_type == 'direct_mail') %>%
rename (dm_acq_sent = sent
, dm_acq_open = open
, dm_acq_click = clicked
)
The problem with this code above is that (once I fix the renaming issue) it will be heavily manual because I have to repeat the same chunk of code several times and needs that someone regularly checks that there are no more combinations of camp_type and channel_type.
Any help / advise will be massively appreciated.
With tidyr you can use pivot_wider:
library(tidyr)
pivot_wider(df, id_cols = date, names_from = c(camp_type, channel_type), values_from = c(sent, open, click))
Output
# A tibble: 3 x 10
date sent_acquisition… sent_acquisition_… sent_newsletter_… open_acquisitio… open_acquisition… open_newsletter… click_acquisiti… click_acquisitio… click_newslette…
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2020-01-01 100 NA NA 30 NA NA 14 NA NA
2 2020-02-01 NA 200 50 NA NA 14 NA NA 1
3 2020-03-01 250 NA NA 148 NA NA 100 NA NA
Data
df <- structure(list(date = structure(c(18262, 18293, 18293, 18322), class = "Date"),
camp_type = structure(c(1L, 1L, 2L, 1L), .Label = c("acquisition",
"newsletter"), class = "factor"), channel_type = structure(c(2L,
1L, 2L, 2L), .Label = c("direct_email", "email"), class = "factor"),
sent = c(100, 200, 50, 250), open = c(30, NA, 14, 148), click = c(14,
NA, 1, 100)), class = "data.frame", row.names = c(NA, -4L
))

Error with a function inside a Lapply() R

I'm having a very strange error in a script that used to work perfectly, and I don't know what the problem is. I start by creating a very long list of data frames, each with the exact same number of columns. The list is called lst. Then I want to build a summary table with means and SDs. Here is the script for that:
w1 <- lapply(lst, function(i) t(cbind(Mean = colMeans(i[, c(6,7,8,9)], na.rm = TRUE),
Sds = colSds(as.matrix(i[, c(6,7,8,9)]), na.rm = TRUE),
N = length(i[,2]),
len.max=max(i[,6]))))
The number of the columns are correct. However when I run the script first I get the Debug location and when I stopped I get this error message:
Error in t(cbind(Mean = colMeans(i[, c(6, 7, 8, 9)], na.rm = TRUE), Sds = colSds(as.matrix(i[, :
error in evaluating the argument 'x' in selecting a method for function 't': Error in `[.data.frame`(i, , c(6, 7, 8, 9)) : undefined columns selected
I don't know what's wrong with the function. I searched the internet and saw something about changing as.matrix to data.matrix; however, that did not do the trick.
Indeed I get the same problem for another function very similar:
a1 <- lapply(lst, function(i) t(cbind(l1 = NROW(which(i[,6]>1)),
l1.05 = NROW(which(i[,6]<=1)) - NROW(which(i[,6]>0.5)),
l05.03 = NROW(which(i[,6]>0.3)) - NROW(which(i[,6]<=0.5)),
l03 = NROW(which(i[,6]<=0.3)))))
With the same outcome:
Error in t(cbind(l1 = NROW(which(i[, 6] > 1)), l1.05 = NROW(which(i[, :
error in evaluating the argument 'x' in selecting a method for function 't': Error in `[.data.frame`(i, , 6) : undefined columns selected
Can someone point me out what is the problem. Do you need some data? Thanks!
I'm working with the last RStudio and with the following packages:
plyr, matrixStats,dplyr
Here is an example of the list:
> lst
[[1]]
X Chr new pos1 pos2 len nsnp n.ind per.ind
1 1 1 1 12900000 13700000 0.9 284.7560 23.77778 7.952434
2 2 1 2 17000000 17300000 0.4 126.5582 16.00000 5.351171
3 3 1 3 21200000 21500000 0.4 126.5582 40.75000 13.628763
4 4 1 4 45300000 45700000 0.5 158.1978 23.20000 7.759197
5 5 1 5 45900000 46600000 0.8 253.1165 31.12500 10.409699
[[2]]
X Chr new pos1 pos2 len nsnp n.ind per.ind
1 1 1 1 12900000 13700000 0.9 312.90267 24.44444 4.288499
2 2 1 2 21200000 21500000 0.4 139.06785 38.00000 6.666667
3 3 1 3 32600000 33000000 0.5 173.83482 28.40000 4.982456
4 4 1 4 35800000 36100000 0.4 139.06785 37.25000 6.535088
5 5 1 5 36300000 36300000 0.1 34.76696 22.00000 3.859649
[[3]]
X Chr new pos1 pos2 len nsnp n.ind per.ind
1 1 1 1 35700000 36500000 0.9 287.4214 12.22222 11.42264
2 2 1 2 45900000 46600000 0.8 255.4857 12.50000 11.68224
3 3 1 3 49400000 50700000 1.4 447.1000 21.78571 20.36048
4 4 1 4 51000000 52000000 1.1 351.2929 16.00000 14.95327
5 5 1 5 52200000 53000000 0.9 287.4214 19.66667 18.38006
dput(lst[1:3])
list(structure(list(X = 1:5, Chr = c(1L, 1L, 1L, 1L, 1L), new = 1:5,
pos1 = c(12900000, 1.7e+07, 21200000, 45300000, 45900000),
pos2 = c(13700000, 17300000, 21500000, 45700000, 46600000
), len = c(0.9, 0.4, 0.4, 0.5, 0.8), nsnp = c(284.756031128405,
126.558236057069, 126.558236057069, 158.197795071336, 253.116472114137
), n.ind = c(23.7777777777778, 16, 40.75, 23.2, 31.125),
per.ind = c(7.95243403939056, 5.35117056856187, 13.628762541806,
7.75919732441472, 10.4096989966555)), .Names = c("X", "Chr",
"new", "pos1", "pos2", "len", "nsnp", "n.ind", "per.ind"), row.names = c(NA,
5L), class = "data.frame"), structure(list(X = 1:5, Chr = c(1L,
1L, 1L, 1L, 1L), new = 1:5, pos1 = c(12900000, 21200000, 32600000,
35800000, 36300000), pos2 = c(13700000, 21500000, 3.3e+07, 36100000,
36300000), len = c(0.9, 0.4, 0.5, 0.4, 0.1), nsnp = c(312.90267141585,
139.0678539626, 173.83481745325, 139.0678539626, 34.76696349065
), n.ind = c(24.4444444444444, 38, 28.4, 37.25, 22), per.ind = c(4.28849902534113,
6.66666666666667, 4.98245614035088, 6.53508771929825, 3.85964912280702
)), .Names = c("X", "Chr", "new", "pos1", "pos2", "len", "nsnp",
"n.ind", "per.ind"), row.names = c(NA, 5L), class = "data.frame"),
structure(list(X = 1:5, Chr = c(1L, 1L, 1L, 1L, 1L), new = 1:5,
pos1 = c(35700000, 45900000, 49400000, 5.1e+07, 52200000
), pos2 = c(36500000, 46600000, 50700000, 5.2e+07, 5.3e+07
), len = c(0.9, 0.8, 1.4, 1.1, 0.9), nsnp = c(287.421428571429,
255.485714285714, 447.1, 351.292857142857, 287.421428571429
), n.ind = c(12.2222222222222, 12.5, 21.7857142857143,
16, 19.6666666666667), per.ind = c(11.4226375908619,
11.6822429906542, 20.3604806408545, 14.9532710280374,
18.380062305296)), .Names = c("X", "Chr", "new", "pos1",
"pos2", "len", "nsnp", "n.ind", "per.ind"), row.names = c(NA,
5L), class = "data.frame"))

Sum Event Data in R

I'm working with some daily rainfall data that spans several years. I want to sum the rainfall on consecutive rainy days to get a rainfall total for that rainfall event. It would also be nice to get a start and stop date and a rainfall intensity per event. I'm thinking I could hack something together with aggregate; however, what I'm picturing in my head seems very bulky. Is there a quick and elegant solution, possibly to be found with dplyr, tidyr, or data.table?
Data
structure(list(Time = structure(c(1353398400, 1353484800, 1353571200,
1353657600, 1353744000, 1353830400, 1353916800, 1354003200, 1354089600,
1354176000, 1354262400, 1354348800, 1354435200, 1354521600, 1354608000,
1354694400, 1354780800, 1354867200, 1354953600, 1355040000, 1355126400,
1355212800, 1355299200, 1355385600, 1355472000, 1355558400, 1355644800,
1355731200, 1355817600, 1355904000, 1355990400, 1356076800, 1356163200,
1356249600, 1356336000, 1356422400, 1356508800, 1356595200, 1356681600,
1356768000, 1356854400, 1356940800, 1357027200, 1357113600, 1357200000,
1357286400, 1357372800, 1357459200, 1357545600, 1357632000, 1357718400
), class = c("POSIXct", "POSIXt"), tzone = ""), inc = c(NA, NA,
NA, NA, NA, NA, NA, 0.11, NA, 0.62, 0.0899999999999999, 0.39,
NA, NA, 0.03, NA, NA, NA, NA, NA, NA, 0.34, NA, NA, NA, NA, 0.0600000000000001,
0.02, NA, NA, NA, 0.29, 0.35, 0.02, 0.27, 0.17, 0.0600000000000001,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.47, NA, NA, NA, 0.0300000000000002
)), .Names = c("Time", "inc"), row.names = 50:100, class = "data.frame")
Desired output
Begin End Days Total Intensity
11/27/2012 11/27/2012 1 0.11 0.11
11/29/2012 12/1/2012 3 1.1 0.366666667
12/4/2012 12/4/2012 1 0.03 0.03
12/11/2012 12/11/2012 1 0.34 0.34
12/16/2012 12/17/2012 2 0.08 0.04
12/21/2012 12/26/2012 6 0.29 0.048333333
1/5/2013 1/5/2013 1 0.47 0.47
1/9/2013 1/9/2013 1 0.03 0.03
data.table::rleid is a convenient function for dealing with consecutive values, assuming your data frame is named df and it has been sorted by Time variable before hand:
library(data.table)
setDT(df)
na.omit(df[,.(Begin = as.Date(first(Time)),
End = as.Date(last(Time)),
Days = as.Date(last(Time)) - as.Date(first(Time)) + 1,
Total = sum(inc), Intensity = mean(inc)),
by = .(id = rleid(is.na(inc)))])
# id Begin End Days Total Intensity
#1: 2 2012-11-27 2012-11-27 1 days 0.11 0.1100000
#2: 4 2012-11-29 2012-12-01 3 days 1.10 0.3666667
#3: 6 2012-12-04 2012-12-04 1 days 0.03 0.0300000
#4: 8 2012-12-11 2012-12-11 1 days 0.34 0.3400000
#5: 10 2012-12-16 2012-12-17 2 days 0.08 0.0400000
#6: 12 2012-12-21 2012-12-26 6 days 1.16 0.1933333 #I think you have some miscalculation here
#7: 14 2013-01-05 2013-01-05 1 days 0.47 0.4700000
#8: 16 2013-01-09 2013-01-09 1 days 0.03 0.0300000
Here is an approach that uses dplyr.
First, some preliminary cleanup: a date variable is needed, not a POSIXct:
library(dplyr)
df2 <- df %>%
mutate(date = as.Date(Time)) %>%
select(-Time)
This computes a data frame with an explicit variable for rain_event:
df3 <- df2 %>%
filter(!is.na(inc)) %>%
mutate(
day_lag = as.numeric(difftime(date, lag(date), units = "days")),
# special case: first rain event
day_lag = ifelse(is.na(day_lag), 1, day_lag),
rain_event = 1 + cumsum(day_lag > 1)
)
> df3
inc date day_lag rain_event
1 0.11 2012-11-27 1 1
2 0.62 2012-11-29 2 2
3 0.09 2012-11-30 1 2
4 0.39 2012-12-01 1 2
5 0.03 2012-12-04 3 3
6 0.34 2012-12-11 7 4
7 0.06 2012-12-16 5 5
8 0.02 2012-12-17 1 5
9 0.29 2012-12-21 4 6
10 0.35 2012-12-22 1 6
11 0.02 2012-12-23 1 6
12 0.27 2012-12-24 1 6
13 0.17 2012-12-25 1 6
14 0.06 2012-12-26 1 6
15 0.47 2013-01-05 10 7
16 0.03 2013-01-09 4 8
Now, summarise by each rain event, computing the metrics you care about:
df3 %>%
group_by(rain_event) %>%
summarise(
begin = min(date),
end = max(date),
days = n(),
total = sum(inc),
intensity = mean(inc)
)
# A tibble: 8 × 6
rain_event begin end days total intensity
<dbl> <date> <date> <int> <dbl> <dbl>
1 1 2012-11-27 2012-11-27 1 0.11 0.1100000
2 2 2012-11-29 2012-12-01 3 1.10 0.3666667
3 3 2012-12-04 2012-12-04 1 0.03 0.0300000
4 4 2012-12-11 2012-12-11 1 0.34 0.3400000
5 5 2012-12-16 2012-12-17 2 0.08 0.0400000
6 6 2012-12-21 2012-12-26 6 1.16 0.1933333
7 7 2013-01-05 2013-01-05 1 0.47 0.4700000
8 8 2013-01-09 2013-01-09 1 0.03 0.0300000
You can append a new column that group rows when they represent a continuous rainy period, then get the statistics you want using dplyr. assuming that your dataframe is called df:
library(dplyr)
rain_period = rep(NA,nrow(df)) #initialize vector
group=1 #initialize group number
for(i in 1:nrow(df)){
if(is.na(df$inc[i])) group = group + 1
else rain_period[i] = group
}
df$group = rain_period
result = dplyr::group_by(df,group)
result = dplyr::summarise(result,
Begin = min(Time),
End = max(Time),
Days = n(),
Total = sum(inc),
Intensity = mean(inc))
Only base packages, and basically using aggregate function. I know it is not the nicest option around. The only problem is with the format of dates (the columns of data frame must be specified one-by-one for the desired date format, otherwise it will be converted to integer):
data1 <- structure(list(Time = structure(c(1353398400, 1353484800, 1353571200,
1353657600, 1353744000, 1353830400, 1353916800, 1354003200, 1354089600,
1354176000, 1354262400, 1354348800, 1354435200, 1354521600, 1354608000,
1354694400, 1354780800, 1354867200, 1354953600, 1355040000, 1355126400,
1355212800, 1355299200, 1355385600, 1355472000, 1355558400, 1355644800,
1355731200, 1355817600, 1355904000, 1355990400, 1356076800, 1356163200,
1356249600, 1356336000, 1356422400, 1356508800, 1356595200, 1356681600,
1356768000, 1356854400, 1356940800, 1357027200, 1357113600, 1357200000,
1357286400, 1357372800, 1357459200, 1357545600, 1357632000, 1357718400
), class = c("POSIXct", "POSIXt"), tzone = ""), inc = c(NA, NA,
NA, NA, NA, NA, NA, 0.11, NA, 0.62, 0.0899999999999999, 0.39,
NA, NA, 0.03, NA, NA, NA, NA, NA, NA, 0.34, NA, NA, NA, NA, 0.0600000000000001,
0.02, NA, NA, NA, 0.29, 0.35, 0.02, 0.27, 0.17, 0.0600000000000001,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.47, NA, NA, NA, 0.0300000000000002
)), .Names = c("Time", "inc"), row.names = 50:100, class = "data.frame")
# Summarize consecutive-rainy-day runs in a daily rainfall data frame.
#
# Expects `datas` to have a POSIXct `Time` column and an `inc` column where
# dry days are NA. Each maximal run of non-NA `inc` values is one rain
# event; for each event the begin/end date, day count, total rainfall, and
# mean intensity are reported.
#
# @param datas data frame with columns Time (POSIXct) and inc (numeric/NA).
# @return data.frame with columns Begin, End, Days, Total, Intensity
#   (dates formatted "%m/%d/%Y").
rainruns <- function(datas = data1) {
  # Prepend an NA so each position can look at its predecessor.
  shifted <- c(NA, datas$inc)
  # A new event starts wherever a non-NA value follows an NA; cumsum of
  # those starts labels every row with its event number.
  event_id <- cumsum(is.na(head(shifted, -1)) & !is.na(shifted[-1]))
  rainy <- cbind(datas, event = event_id)
  rainy <- rainy[!is.na(rainy$inc), ]  # drop the dry (NA) days
  # Per-event rainfall stats: day count, total, and mean intensity.
  stats <- aggregate(rainy$inc, by = list(rainy$event),
                     FUN = function(v) c(length(v), sum(v), mean(v)))[[2]]
  # Per-event first and last date.
  dates <- aggregate(as.Date(rainy$Time), by = list(rainy$event),
                     FUN = function(d) c(min(d), max(d)))[[2]]
  # aggregate() returned the dates as integers; restore Date and format.
  fmt <- function(d) format(as.Date(d, origin = "1970-01-01"), "%m/%d/%Y")
  out <- data.frame(fmt(dates[, 1]), fmt(dates[, 2]), stats)
  names(out) <- c("Begin", "End", "Days", "Total", "Intensity")
  out
}

Resources