Take mean of certain values based on groups in R

I have a dataset containing daily cryptocurrency returns, ret.daily; a small part of it looks like this:
Bitcoin Ethereum XRP Bitcoin.Cash Bitcoin.SV ...
2018-01-01 -0.04 0.02 0.04 -0.04 NA
2018-01-02 0.09 0.13 0.04 0.11 NA
2018-01-03 0.01 0.08 0.23 -0.04 NA
...
I have then assigned each coin to one of 5 groups for each day, based on certain values, giving price.groups.daily (only the biggest coins are shown here; there are many other coins, so there are coins in each of the 5 groups):
Bitcoin Ethereum XRP Bitcoin.Cash Bitcoin.SV ...
2018-01-01 5 5 4 5 NA
2018-01-02 5 5 4 5 NA
2018-01-03 5 5 4 5 NA
...
What I then want to do is take the mean of each group for each day and build a new matrix, looking like this:
1 2 3 4 5
2018-01-01 Mean(groups 1 numbers) Mean(groups 2 numbers) ... ... mean(-0.04, 0.02,-0.04,...)
2018-01-02 Mean(groups 1 numbers) Mean(groups 2 numbers)
2018-01-03 Mean(groups 1 numbers) Mean(groups 2 numbers)
...
When I made the grouping, I did the following (where price.daily is the daily price data, which is what I used to sort the coins into groups):
## one row index per day
col.daily <- seq(1,length(price.daily$Bitcoin))
## per-day quantile breakpoints (0%, 20%, ..., 100%) across all coins
quantile.daily = sapply(col.daily, function(y) {quantile(x = unlist(price.daily[y,] ), seq(0,1, length=6),na.rm = TRUE )})
quantile.daily.t = t(quantile.daily)
rownames(quantile.daily.t) = rownames(price.daily)
combined.daily = cbind(price.daily, quantile.daily.t)
## assign each coin to the quantile interval its price falls into
price.groups.daily = as.data.frame(t(apply(combined.daily, 1, function(x) findInterval(x[1:ncol(price.daily)], x[(1 + ncol(price.daily)):ncol(combined.daily)]))))
colnames(price.groups.daily) = colnames(price.daily)
## a price equal to the maximum lands in interval 6; fold it back into group 5
price.groups.daily[price.groups.daily == 6] = 5
I added the last line like that since I didn't know how to handle the case where the biggest value equals the upper breakpoint of the last interval, but this works just fine. I imagine the group means could also be computed using some apply function, I am just not certain how, since before I could use functions such as quantile() and findInterval(), which did exactly what I wanted to do. Is there a function that could work in this scenario?
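To make the goal concrete, here is a naive row-by-row sketch of the computation I am after (a sketch only, assuming ret.daily and price.groups.daily have identical dimensions and row order):
# for each day, average the returns within the groups given by price.groups.daily
group.means.daily <- t(sapply(seq_len(nrow(ret.daily)), function(i) {
  tapply(unlist(ret.daily[i, ]),
         unlist(price.groups.daily[i, ]),
         mean, na.rm = TRUE)[as.character(1:5)]
}))
dimnames(group.means.daily) <- list(rownames(ret.daily), 1:5)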
EDIT: Added some of my data using dput(head(price.groups.daily[1:5])) (my data starts in 2014, but I started from 2018 in my example, since most coins didn't exist at that time):
structure(list(Bitcoin = c(5, 5, 5, 5, 5, 5), Ethereum = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), XRP = c(1L, 1L, 1L, 1L, 1L, 2L), Bitcoin.Cash = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), Bitcoin.SV = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_)), row.names = c("2014-01-01", "2014-01-02",
"2014-01-03", "2014-01-04", "2014-01-05", "2014-01-06"), class = "data.frame")
and for dput(head(ret.daily[1:5])):
structure(list(Bitcoin = c(0.0201473710988784, 0.048620314369761,
0.0826106401572204, 0.0209460599834816, -0.17281055170073, 0.0495261478685647
), Ethereum = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), XRP = c(-0.0390090806022911, 0.0180075172268452, -0.108767309981219,
0.0184572292482077, -0.111605656954607, 0.0104300601469132),
Bitcoin.Cash = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), Bitcoin.SV = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_)), row.names = c("2014-01-03",
"2014-01-04", "2014-01-05", "2014-01-06", "2014-01-07", "2014-01-08"
), class = "data.frame")

You could have a look at data.table or various tidyverse functions to accomplish that.
Below is an example using data.table:
library(data.table)
library(Hmisc)
## prepare example data
set.seed(1)
dts <- seq.Date(
from = as.Date("2019/1/1"),
to = as.Date("2020/1/1"),
by = 1
)
ret.daily <- matrix(
rnorm(length(dts) * 50, 0, 6),
ncol = 50,
dimnames = list(
dts,
c("Bitcoin", "Ethereum", "XRP", "Bitcoin.Cash", "Bitcoin.SV",
paste0("coin_", sprintf("%02d", seq_len(45))))
))
ret.daily[sample(seq_len(length(ret.daily)), 200 )] <- NA # add some NA's
ret.daily <- data.frame(ret.daily)
## start of summarizations
ret.daily <- melt(data.table(date = as.character(dts), ret.daily), id.vars = "date")
setkey(ret.daily, date, variable)
cuts <- ret.daily[, .(as.list(
Hmisc::cut2(value, g = 6)
)), by = .(date)]
setkey(cuts, date)
# grouping based on daily percentiles (in long format)
ret.daily[, group := unlist(lapply(cuts$V1, as.numeric))][]
#> date variable value group
#> 1: 2019-01-01 Bitcoin -3.7587229 2
#> 2: 2019-01-01 Ethereum 4.0700411 5
#> 3: 2019-01-01 XRP -6.3744503 1
#> 4: 2019-01-01 Bitcoin.Cash -4.5996998 2
#> 5: 2019-01-01 Bitcoin.SV -4.9012655 2
#> ---
#> 18296: 2020-01-01 coin_41 -4.1377852 2
#> 18297: 2020-01-01 coin_42 -0.7649347 3
#> 18298: 2020-01-01 coin_43 0.7698973 4
#> 18299: 2020-01-01 coin_44 -4.6674720 2
#> 18300: 2020-01-01 coin_45 -3.6291231 2
# summarize mean by group and date, and casting the data into wide format
dcast(ret.daily[, .(mean = mean(value, na.rm = TRUE)), by = .(date, group)],
date ~ group, value.var = "mean")
#> date NA 1 2 3 4 5 6
#> 1: 2019-01-01 NA -8.284783 -4.173707 -0.9096477 1.3175870 4.501497 11.123123
#> 2: 2019-01-02 NA -7.379199 -4.502193 -2.1457718 1.1179902 4.207471 8.069149
#> 3: 2019-01-03 NaN -9.070030 -4.708133 -1.8032877 0.9011769 2.699407 7.673678
#> 4: 2019-01-04 NA -7.019294 -2.995686 -0.9035496 1.6644289 4.565588 9.178561
#> 5: 2019-01-05 NA -9.457924 -3.957598 -1.9535285 0.3493898 3.265330 7.396461
#> ---
#> 362: 2019-12-28 NA -9.866193 -4.481655 -2.2775438 1.0612454 3.863716 9.159870
#> 363: 2019-12-29 NA -8.555226 -3.319358 -0.6815004 1.5801415 4.379455 9.354069
#> 364: 2019-12-30 NA -7.430636 -4.011801 -1.3067570 2.2528401 4.805392 10.595387
#> 365: 2019-12-31 NA -7.316091 -2.784448 -0.8047659 0.7121429 3.508579 7.714213
#> 366: 2020-01-01 NaN -8.502224 -4.369027 -1.7029667 0.5042703 3.959396 9.084915
Created on 2020-04-15 by the reprex package (v0.3.0)
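For comparison, a roughly equivalent tidyverse sketch (my addition; it uses dplyr::ntile() instead of Hmisc::cut2(), so the exact group boundaries can differ slightly, and it assumes ret.daily is still the wide data.frame from the "prepare example data" step above, i.e. before the melt):
library(dplyr)
library(tidyr)
library(tibble)
ret.daily %>%
  rownames_to_column("date") %>%
  pivot_longer(-date, names_to = "coin", values_to = "value") %>%
  group_by(date) %>%
  mutate(group = ntile(value, 6)) %>%   # rank-based groups per day, not cut2() bins
  group_by(date, group) %>%
  summarise(mean = mean(value, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = group, values_from = mean)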

Identify intervals where a given vector of dates occurs

it's my first time posting here!
I am super stuck on what I'm sure is an easy thing to do.
I have a dataframe of irregular intervals and a vector of dates. If one of the dates occurs within any of the given intervals, I would like a new column to flag this (as the intervals need to be deleted). Similar to this post but the solution doesn't work due to the irregular intervals. I have over 2000 intervals and 2000 dates.
I can get the dates that occur within the given intervals using the %within% function, but this is no good as I can't find which intervals the dates occur in.
I've tried the solutions in this similar post but I have no grouping variable and can't get them to work.
Any suggestions would be incredibly helpful!!! Thank you so much!!
Example raw data (not as lubridate intervals):
>df1
diveno start fin
1 1 2018-08-01 08:20:40 2018-08-01 08:39:20
2 2 2018-08-01 08:40:50 2018-08-01 08:53:40
3 3 2018-08-01 10:01:00 2018-08-01 10:16:30
4 4 2018-08-01 15:45:30 2018-08-01 15:58:20
5 5 2018-08-01 17:06:00 2018-08-01 17:18:20
>df2
date
1 2018-08-01 08:30:00
2 2018-08-01 15:47:00
3 2018-08-02 17:10:00
What I'd like
> df3
diveno start fin dateoccurs
1 1 2018-08-01 08:20:40 2018-08-01 08:39:20 Y
2 2 2018-08-01 08:40:50 2018-08-01 08:53:40 N
3 3 2018-08-01 10:01:00 2018-08-01 10:16:30 N
4 4 2018-08-01 15:45:30 2018-08-01 15:58:20 Y
5 5 2018-08-01 17:06:00 2018-08-01 17:18:20 N
Where the dateoccurs column flags if a date from df2 occurs in given intervals in df1
Code for example data:
df1<-data.frame(diveno=c(1,2,3,4,5),
start=c("2018-08-01 08:20:40","2018-08-01 08:40:50", "2018-08-01 10:01:00","2018-08-01 15:45:30","2018-08-01 17:06:00"),
fin=c("2018-08-01 08:39:20","2018-08-01 08:53:40","2018-08-01 10:16:30","2018-08-01 15:58:20", "2018-08-01 17:18:20"))
df1$start <- as.POSIXct(df1$start,format="%Y-%m-%d %H:%M:%S",tz="CET")
df1$fin <- as.POSIXct(df1$fin,format="%Y-%m-%d %H:%M:%S",tz="CET")
df2<-data.frame(date=c("2018-08-01 08:30:00", "2018-08-01 15:47:00", "2018-08-02 17:10:00"))
df2$date <- as.POSIXct(df2$date,format="%Y-%m-%d %H:%M:%S",tz="CET")
What I need:
df3<-data.frame(diveno=c(1,2,3,4,5),
start=c("2018-08-01 08:20:40","2018-08-01 08:40:50", "2018-08-01 10:01:00","2018-08-01 15:45:30","2018-08-01 17:06:00"),
fin=c("2018-08-01 08:39:20","2018-08-01 08:53:40","2018-08-01 10:16:30","2018-08-01 15:58:20", "2018-08-01 17:18:20"),
dateoccurs=c("Y","N","N","Y","N"))
The closest I've gotten is using an answer from this post, but it returns altered 'fin' times, and when applied to the real, massive dataset it seems to duplicate values and change the number of 'diveno'!
intervals<-df1
elements<-df2[,1]
library(data.table) #v1.10.0
j<-setDT(intervals)[data.table(elements), on = .(start <= elements, fin >= elements)]
j2<-as.data.frame(j)
na.omit(j2)
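(Side note: the altered 'fin' times are expected here, because a data.table non-equi join reports the join columns with the values taken from the i table, elements; a sketch of one workaround is to keep copies of the original bounds before joining:)
library(data.table)
setDT(intervals)
# keep the original interval bounds, since the join overwrites start/fin
# with the matched values from `elements`
intervals[, `:=`(start_orig = start, fin_orig = fin)]
j <- intervals[data.table(elements), on = .(start <= elements, fin >= elements)]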
UPDATED sample data for df2 that seems to produce false positives?
> dput(df2) structure(list(date = structure(c(1533096000, 1533096300, 1533096600, 1533096900, 1533097200, 1533097500, 1533097800, 1533098100, 1533098400, 1533098700, 1533099000, 1533099300, 1533099600, 1533099900, 1533100200, 1533100500, 1533100800, 1533101100, 1533101400, 1533101700, 1533102000, 1533102300, 1533102600, 1533102900, 1533103200, 1533103500, 1533103800, 1533104100, 1533104400, 1533104700, 1533105000, 1533105300, 1533105600, 1533105900, 1533106200, 1533106500, 1533106800, 1533107100, 1533107400, 1533107700, 1533108000, 1533108300, 1533108600, 1533108900, 1533109200, 1533109500, 1533109800, 1533110100), tzone = "UTC", class = c("POSIXct", "POSIXt")), depth = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), class = "data.frame", row.names = c(NA,
-48L))
Update 2 (sorry!)
Within df2, the following times:
30 2018-08-01 06:25:00 NA
31 2018-08-01 06:30:00 NA
32 2018-08-01 06:35:00 NA
seem to be recognised as falling within the following intervals:
diveno start fin dateoccurs
1 1 2018-08-01 08:20:40 2018-08-01 08:39:20 Y
2 2 2018-08-01 08:40:50 2018-08-01 08:53:40 Y
3 3 2018-08-01 10:01:00 2018-08-01 10:16:30 N
Why might this be happening?
s <- df1[1, 2]
f <- df1[1, 3]
int <- interval(s, f)
df2[, 1] %within% int
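For reference, a sketch (not taken from the answers below) of how that %within% check could be extended to flag every dive at once:
library(lubridate)
# flag each dive whose [start, fin] interval contains at least one date from df2
df1$dateoccurs <- ifelse(
  sapply(seq_len(nrow(df1)),
         function(i) any(df2$date %within% interval(df1$start[i], df1$fin[i]))),
  "Y", "N")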
A data.table solution using a non-equi join:
library(data.table)
setDT(df1); setDT(df2)
# initialise new column with "N"
df1[, dateoccurs := "N"]
# update join
df1[df2, dateoccurs := "Y", on = .(start <= date, fin >= date)][]
# diveno start fin dateoccurs
# 1: 1 2018-08-01 08:20:40 2018-08-01 08:39:20 Y
# 2: 2 2018-08-01 08:40:50 2018-08-01 08:53:40 N
# 3: 3 2018-08-01 10:01:00 2018-08-01 10:16:30 N
# 4: 4 2018-08-01 15:45:30 2018-08-01 15:58:20 Y
# 5: 5 2018-08-01 17:06:00 2018-08-01 17:18:20 N
You may use outer.
fun <- function(i, j) data.table::between(df2[i, 'date'], df1[j, 'start'], df1[j, 'fin'])
df1$occ <- colSums(outer(seq_len(nrow(df2)), seq_len(nrow(df1)), Vectorize(fun)))
df1
# diveno start fin occ
# 1 1 2018-08-01 08:20:40 2018-08-01 08:39:20 1
# 2 2 2018-08-01 08:40:50 2018-08-01 08:53:40 0
# 3 3 2018-08-01 10:01:00 2018-08-01 10:16:30 0
# 4 4 2018-08-01 15:45:30 2018-08-01 15:58:20 1
# 5 5 2018-08-01 17:06:00 2018-08-01 17:18:20 0
The binary column can easily be converted to a factor if you like.
df1$occ <- colSums(outer(seq_len(nrow(df2)), seq_len(nrow(df1)), Vectorize(fun))) |>
factor(labels=c("N", "Y"))
df1
# diveno start fin occ
# 1 1 2018-08-01 08:20:40 2018-08-01 08:39:20 Y
# 2 2 2018-08-01 08:40:50 2018-08-01 08:53:40 N
# 3 3 2018-08-01 10:01:00 2018-08-01 10:16:30 N
# 4 4 2018-08-01 15:45:30 2018-08-01 15:58:20 Y
# 5 5 2018-08-01 17:06:00 2018-08-01 17:18:20 N
Data:
df1 <- structure(list(diveno = c(1, 2, 3, 4, 5), start = structure(c(1533104440,
1533105650, 1533110460, 1533131130, 1533135960), class = c("POSIXct",
"POSIXt"), tzone = "CET"), fin = structure(c(1533105560, 1533106420,
1533111390, 1533131900, 1533136700), class = c("POSIXct", "POSIXt"
), tzone = "CET"), occ = structure(c(2L, 1L, 1L, 2L, 1L), levels = c("N",
"Y"), class = "factor")), row.names = c(NA, -5L), class = "data.frame")
df2 <- structure(list(date = structure(c(1533105000, 1533131220, 1533222600
), class = c("POSIXct", "POSIXt"), tzone = "CET")), row.names = c(NA,
-3L), class = "data.frame")

Simple but not easy merge task

I have two incomplete data frames (df_a, df_b): columns are missing or contain NA values. "by" is the merge index, and df_a has "priority" over df_b.
df_a = structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400, 1635174000), class = c("POSIXct", "POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513), Export = c("10.912", "10.47", NA, NA), color = c("rgb(0,128,0)", "rgb(0,128,0)", NA, NA), Status = c("ok", "ok", NA, NA), Plausibilität = c("4", "4", NA, NA), min = c(7.93000000000001, 9.4, 8.7, 8.3), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625)), row.names = c(NA, -4L), class = "data.frame")
df_b = structure(list(Datum = structure(c(1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct", "POSIXt")), Vorhersage = c(14.821988, 14.832919, 14.706179, 14.573527), Referenz = c(16.6, 16.2, 15.9, 16), DWD_Name = c("Elpersbüttel", "Elpersbüttel", "Elpersbüttel", "Elpersbüttel"), Export = c(17.198, 16.713, 16.378, 16.358), color = c("rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)"), Status = c("ok", "ok", "ok", "ok"), Plausibilität = c(4, 4, 4, 4), min = c(13.05, 12.808, 11.631891, 12.312), max = c(17, 17, 16.9, 16.7)), row.names = c(NA, -4L), class = "data.frame")
desired output is:
Datum Vorhersage Export color Status Plausibilität min max Referenz
1 2021-10-25 14:00:00 10.3 10.912 rgb(0,128,0) ok 4 7.9 12 NA
2 2021-10-25 15:00:00 10.2 10.47 rgb(0,128,0) ok 4 9.4 12 NA
3 2021-10-25 16:00:00 10.0 <NA> <NA> <NA> <NA> 8.7 13 NA
4 2021-10-25 17:00:00 9.7 <NA> <NA> <NA> <NA> 8.3 12 NA
5 2021-09-24 21:00:00 14.8 17.198 rgb(0,128,0) ok 4 13.1 17 17
6 2021-09-24 22:00:00 14.8 16.713 rgb(0,128,0) ok 4 12.8 17 16
7 2021-09-24 23:00:00 14.7 16.378 rgb(0,128,0) ok 4 11.6 17 16
8 2021-09-25 00:00:00 14.6 16.358 rgb(0,128,0) ok 4 12.3 17 16
DWD_Name
1 <NA>
2 <NA>
3 <NA>
4 <NA>
5 Elpersbüttel
6 Elpersbüttel
7 Elpersbüttel
8 Elpersbüttel
# for rebuild:
structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400,
1635174000, 1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct",
"POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513,
14.821988, 14.832919, 14.706179, 14.573527), Export = c("10.912",
"10.47", NA, NA, "17.198", "16.713", "16.378", "16.358"), color = c("rgb(0,128,0)",
"rgb(0,128,0)", NA, NA, "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)",
"rgb(0,128,0)"), Status = c("ok", "ok", NA, NA, "ok", "ok", "ok",
"ok"), Plausibilität = c("4", "4", NA, NA, "4", "4", "4", "4"
), min = c(7.93000000000001, 9.4, 8.7, 8.3, 13.05, 12.808, 11.631891,
12.312), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625,
17, 17, 16.9, 16.7), Referenz = c(NA, NA, NA, NA, 16.6, 16.2,
15.9, 16), DWD_Name = c(NA, NA, NA, NA, "Elpersbüttel", "Elpersbüttel",
"Elpersbüttel", "Elpersbüttel")), row.names = c(NA, -8L), class = "data.frame")
Thanks to the help of @r2evans I tried the following:
by = "Datum"
library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), by)
setDT(df_a)
setDT(df_b)
merge(df_a, df_b, by = by, all = TRUE
)[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]
but I get the following error:
Error in fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]) :
Item 2 is type double but the first item is type character. Please coerce
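A sketch of one way around that type clash (my own tweak to the snippet above, which coerces a pair of suffixed columns to character whenever their classes disagree; convert back to numeric afterwards if needed):
merge(df_a, df_b, by = by, all = TRUE
  )[, (colnms) := lapply(colnms, function(nm) {
      x <- .SD[[paste0(nm, ".x")]]
      y <- .SD[[paste0(nm, ".y")]]
      # coerce to character when the two sides disagree on type (e.g. Export)
      if (!identical(class(x), class(y))) {
        x <- as.character(x)
        y <- as.character(y)
      }
      fcoalesce(x, y)
    })
  ][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]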
Most of the other answers are good, but many either over-complicate the result (in my opinion) or they perform a left or right join, not the full join as expected in the OP.
Here's a quick solution that uses dynamic column names.
library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), "by")
colnms
# [1] "a"
setDT(df_a)
setDT(df_b)
merge(df_a, df_b, by = "by", all = TRUE
)[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]
# by b c a
# <num> <num> <num> <num>
# 1: 1 1 NA 1
# 2: 2 NA 2 2
# 3: 3 3 3 3
# 4: 4 NA 4 4
Notes:
the normal data.table::[ merge is a left-join only, so we need to use data.table::merge in order to be able to get a full-join with all=TRUE;
because it's using merge, the repeated columns get the .x and .y suffixes, something we can easily capitalize on;
the canonical and most-performant way when using (colnms) := ... is to also include .SDcols=colnms, but that won't work as well here since we need the suffixed columns, not the colnms columns themselves; this is a slight performance penalty but certainly not an anti-pattern (I believe) given what we need to do; and since we could have more than one duplicate column, we have to be careful to do it with each pair at a time, not all of them at once;
the last [-block (using outer) is for removing the duplicate columns; without it, the output would have column names c("by", "a.x", "b", "a.y", "c", "a"). It uses outer because that's a straight-forward way to get 1-or-more colnms and combine .x and .y to each of them; it then uses data.table's := NULL shortcut for removing one-or-more columns.
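To make that last point concrete, here is a tiny illustration of what the outer() call expands to (using a hypothetical colnms of c("a", "b")):
colnms <- c("a", "b")
c(outer(colnms, c(".x", ".y"), paste0))
# [1] "a.x" "b.x" "a.y" "b.y"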
This isn't the most elegant, but you can make a function that applies your rule to coalesce the values if they occur in both data frames.
# find the unique column names (not called "by")
cols <- union(names(df_a),names(df_b))
cols <- cols[!(cols == "by")]
# merge the data sets
df_merge <- merge(df_a, df_b, by = "by", all = TRUE)
# function to check for the base column names that now have a '.x' and
# a '.y' version. for the columns, fill in the NAs from '.x' with the
# value from '.y'
col_val <- function(col_base, df) {
x <- names(df)
if (all(paste0(col_base, c(".x", ".y")) %in% x)) {
na.x <- is.na(df[[paste0(col_base, ".x")]])
df[[paste0(col_base, ".x")]][na.x] <- df[[paste0(col_base, ".y")]][na.x]
df[[paste0(col_base, ".x")]]
} else {
df[[col_base]]
}
}
# apply this function to every column
cbind(df_merge["by"], sapply(cols, col_val, df = df_merge))
This will give the following result.
by a b c
1 1 1 1 NA
2 2 2 NA 2
3 3 3 3 3
4 4 4 NA 4
I know you specified base R, but the natural_join() function is worth mentioning.
library(rqdatatable)
natural_join(df_a, df_b, by = "by", jointype = "FULL")
This gives exactly what you want.
by a b c
1 1 1 1 NA
2 2 2 NA 2
3 3 3 3 3
4 4 4 NA 4
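With the data frames from the question, the call would presumably just swap in the actual key column (an untested sketch):
library(rqdatatable)
natural_join(df_a, df_b, by = "Datum", jointype = "FULL")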
Not a base R answer, but here is one possible solution with the data.table package:
library(data.table)
setDT(df_a)
setDT(df_b)
df_a <- rbind(df_a, list(4, NA, NA))
df_b <- rbind(list(1, NA, NA), df_b)
df_a[df_b, `:=` (a = fifelse(is.na(a), i.a, a), c = c), on = .(by)][]
#> by a b c
#> 1: 1 1 1 NA
#> 2: 2 2 NA 2
#> 3: 3 3 3 3
#> 4: 4 4 NA 4
Edit, with the help of @r2evans: a much more elegant and efficient solution:
df_a[df_b, `:=` (a = fcoalesce(a, i.a), c = c), on = .(by)][]
#> by a b c
#> 1: 1 1 1 NA
#> 2: 2 2 NA 2
#> 3: 3 3 3 3
#> 4: 4 4 NA 4
Created on 2021-10-19 by the reprex package (v2.0.1)
Here is a dynamic solution... not bad, but maybe someone knows how to speed it up.
get_complete_df <- function(df_a, df_b, by = "by") {
  df_a = unique(df_a)
  df_b = unique(df_b)
  nam_a = names(df_a)[!(names(df_a) == by)]
  nam_b = names(df_b)[!(names(df_b) == by)]
  nums_a = unlist(lapply(df_a, is.numeric))
  nums_b = unlist(lapply(df_b, is.numeric))
  nums = unique(names(df_a)[nums_a], names(df_b)[nums_b])
  ## try to supplement NAs
  x = df_b[[by]][df_b[[by]] %in% df_a[[by]]]
  y = nam_b[nam_b %in% nam_a]
  vna = is.na(df_a[df_a[, 1] %in% x, y])
  df_a[df_a[, 1] %in% x, y][vna] = df_b[df_b[, 1] %in% x, y][vna]
  ## get complete df
  all_names = c(nam_a, nam_b)
  all_names = c(by, unique(all_names))
  all_by = na.omit(unique(c(df_a[[by]], df_b[[by]])))
  ## build
  df_o = as.data.frame(matrix(nrow = length(all_by), ncol = length(all_names)))
  names(df_o) = all_names
  df_o[[by]] = all_by
  ## fill in content
  df_o[df_o[, 1] %in% df_b[, 1], names(df_b)] = df_b
  df_o[df_o[, 1] %in% df_a[, 1], names(df_a)] = df_a  ## df_a has priority!
  # fix numeric:
  # why did some(!) num fields changed to chr ?
  df_o[, nums] = as.data.frame(apply(df_o[, nums], 2, as.numeric))
  df_o
}
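A quick usage sketch with the data frames from the question (assuming df_a and df_b are still plain data frames at this point):
# df_a keeps priority wherever both frames have a value for the same Datum
df_full <- get_complete_df(df_a, df_b, by = "Datum")
str(df_full)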

R Aggregate over multiple columns

I'm currently working with a large dataframe of 75 columns and roughly 9,500 rows. This dataframe contains observations for every day from 1995-2019 for several observation points.
Edit: the output from dput(head(df)):
> dput(head(df))
structure(list(date = structure(c(9131, 9132, 9133, 9134, 9135,
9136), class = "Date"), x1 = c(50.75, 62.625, 57.25, 56.571,
36.75, 39.125), x2 = c(62.25, 58.714, 49.875, 56.375, 43.25,
41.625), x3 = c(90.25, NA, 70.125, 75.75, 83.286, 98.5),
x4 = c(60, 72, 68.375, 65.5, 63.25, 55.875), x5 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), xn = c(53.25,
61.143, 56.571, 58.571, 36.25, 44.375), year = c(1995, 1995, 1995, 1995,
1995, 1995), month = c(1, 1, 1, 1, 1, 1), day = c(1, 2, 3,
4, 5, 6)), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
The dataframe looks like this sample from it:
date x1 x2 x3 x4 x5 xn year month day
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1995-01-01 50.8 62.2 90.2 60 NA 53.2 1995 1 1
2 1999-08-02 62.6 58.7 NA 72 NA 61.1 1999 8 2
3 2001-09-03 57.2 49.9 70.1 68.4 NA 56.6 2001 9 3
4 2008-05-04 56.6 56.4 75.8 65.5 NA 58.6 2008 5 4
5 2012-04-05 36.8 43.2 83.3 63.2 NA 36.2 2012 4 5
6 2019-12-31 39.1 41.6 98.5 55.9 NA 44.4 2019 12 31
str(df)
tibble [9,131 x 75] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ date : Date[1:9131], format: "1995-01-01" "1995-01-02" ...
$ x1 : num [1:9131] 50.8 62.6 57.2 56.6 36.8 ...
$ x2 : num [1:9131] 62.2 58.7 49.9 56.4 43.2 ...
xn
$ year : num [1:9131] 1995 1995 1995 1995 1995 ...
$ month : num [1:9131] 1 1 1 1 1 1 1 1 1 1 ...
$ day : num [1:9131] 1 2 3 4 5 6 7 8 9 10 ...
My goal is to get, for every observation point xn, the count of all observations per year which cross a certain limit.
So far I have tried to achieve this with the aggregate function.
To get the mean of every year I used the following command:
aggregate(list(df), by=list(year=df$year), mean, na.rm=TRUE)
This works perfectly; I get the mean for every year for every observation point.
To get the sum for one station I used the following code:
aggregate(list(x1=df$x1), by=list(year=df$year), function(x) sum(rle(x)$values>120, na.rm=TRUE))
which results in this print:
year x1
1 1995 52
2 1996 43
3 1997 44
4 1998 42
5 1999 38
6 2000 76
7 2001 52
8 2002 58
9 2003 110
10 2004 34
11 2005 64
12 2006 46
13 2007 46
14 2008 17
15 2009 41
16 2010 30
17 2011 40
18 2012 47
19 2013 40
20 2014 21
21 2015 56
22 2016 27
23 2017 45
24 2018 22
25 2019 45
So far, so good. I know I could expand the code by adding (.., x2=data$x2, x3=data$x3, .. xn) to the list argument in the code above, which I tried, and it works.
But how do I get them all at once?
I tried the following approaches:
aggregate(.~(date, year, month, day), by=list(year=df$year), function(x) sum(rle(x)$values>120, na.rm=TRUE))
Error: unexpected ',' in "aggregate(.~(date,"
aggregate(.~date+year+month+day, by=list(year=df$year), function(x) sum(rle(x)$values>120, na.rm=TRUE))
Error in as.data.frame.default(data, optional = TRUE) :
cannot coerce class ‘"function"’ to a data.frame
aggregate(. ~ date + year + month + day, data = df,by=list(year=df$year), function(x) sum(rle(x)$values>120, na.rm=TRUE))
Error in aggregate.data.frame(lhs, mf[-1L], FUN = FUN, ...) :
arguments must have same length
But unfortunately none of them works. Could someone please give me a hint where my mistake is?
Here is an answer that uses base R. Since none of the values in the example data are above 120, we set a criterion of above 70.
data <- structure(
list(
date = structure(c(9131, 9132, 9133, 9134, 9135,
9136), class = "Date"),
x1 = c(50.75, 62.625, 57.25, 56.571,
36.75, 39.125),
x2 = c(62.25, 58.714, 49.875, 56.375, 43.25,
41.625),
x3 = c(90.25, NA, 70.125, 75.75, 83.286, 98.5),
x4 = c(60, 72, 68.375, 65.5, 63.25, 55.875),
x5 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
xn = c(53.25,
61.143, 56.571, 58.571, 36.25, 44.375),
year = c(1995, 1995, 1995, 1995,
1995, 1995),
month = c(1, 1, 1, 1, 1, 1),
day = c(1, 2, 3,
4, 5, 6)
),
row.names = c(NA,-6L),
class = c("tbl_df", "tbl",
"data.frame"
))
First, we create a subset of the data that contains all columns whose names contain x.
theCols <- data[,colnames(data)[grepl("x",colnames(data))]]
Second, we convert these columns to TRUE or FALSE based on whether each value is greater than 70, and cbind() the year onto the resulting data frame.
x_logical <- cbind(year = data$year,as.data.frame(apply(theCols,2,function(x) x > 70)))
Finally, we use aggregate across all columns other than year and sum the columns.
aggregate(x_logical[2:ncol(x_logical)],by = list(x_logical$year),sum,na.rm=TRUE)
...and the output:
Group.1 x1 x2 x3 x4 x5 xn
1 1995 0 0 5 1 0 0
>
Note that by using colnames() to extract the columns containing x and ncol() in the aggregate() call, we make this a general solution that will handle a varying number of x locations.
Two tidyverse solutions
A tidyverse solution to the same problem is as follows. It includes the following steps.
Use mutate() with across() to create the TRUE / FALSE versions of the x variables. Note that across() requires dplyr 1.0.0, which is currently in development but due for production release the week of May 25th.
Use pivot_longer() to allow us to summarise() multiple measures without a lot of complicated code.
Use pivot_wider() to convert the data back to one column for each x measurement.
...and the code is:
devtools::install_github("tidyverse/dplyr") # needed for across()
library(dplyr)
library(tidyr)
library(lubridate)
data %>%
mutate(.,across(starts_with("x"),~if_else(. > 70,TRUE,FALSE))) %>%
select(-year,-month,-day) %>% group_by(date) %>%
pivot_longer(starts_with("x"),names_to = "measure",values_to = "value") %>%
mutate(year = year(date)) %>% group_by(year,measure) %>%
select(-date) %>%
summarise(value = sum(value,na.rm=TRUE)) %>%
pivot_wider(id_cols = year,names_from = "measure",
values_from = value)
...and the output, which matches the Base R solution that I originally posted:
`summarise()` regrouping output by 'year' (override with `.groups` argument)
# A tibble: 1 x 7
# Groups: year [1]
year x1 x2 x3 x4 x5 xn
<dbl> <int> <int> <int> <int> <int> <int>
1 1995 0 0 5 1 0 0
>
...and here's an edited version of the other answer that will also produce the same results as above. This solution implements pivot_longer() before creating the logical variable for exceeding the threshold, so it does not require the across() function. Also note that since this uses 120 as the threshold value and none of the data meets this threshold, the sums are all 0.
df_example %>%
pivot_longer(x1:x5) %>%
mutate(greater_120 = value > 120) %>%
group_by(year,name) %>%
summarise(sum_120 = sum(greater_120,na.rm = TRUE)) %>%
pivot_wider(id_cols = year,names_from = "name", values_from = sum_120)
...and the output:
`summarise()` regrouping output by 'year' (override with `.groups` argument)
# A tibble: 1 x 6
# Groups: year [1]
year x1 x2 x3 x4 x5
<dbl> <int> <int> <int> <int> <int>
1 1995 0 0 0 0 0
>
Conclusions
As usual, there are many ways to accomplish a given task in R. Depending on one's preferences, the problem can be solved with Base R or the tidyverse. One of the quirks of the tidyverse is that some operations such as summarise() are much easier to perform on narrow format tidy data than on wide format data. Therefore, it's important to be proficient with tidyr::pivot_longer() and pivot_wider() when working in the tidyverse.
That said, with the production release of dplyr 1.0.0, the team at RStudio continues to add features that facilitate working with wide format data.
This should solve your problem
library(tidyverse)
library(lubridate)
df_example <- structure(list(date = structure(c(9131, 9132, 9133, 9134, 9135,
9136), class = "Date"), x1 = c(50.75, 62.625, 57.25, 56.571,
36.75, 39.125), x2 = c(62.25, 58.714, 49.875, 56.375, 43.25,
41.625), x3 = c(90.25, NA, 70.125, 75.75, 83.286, 98.5),
x4 = c(60, 72, 68.375, 65.5, 63.25, 55.875), x5 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), xn = c(53.25,
61.143, 56.571, 58.571, 36.25, 44.375), year = c(1995, 1995, 1995, 1995,
1995, 1995), month = c(1, 1, 1, 1, 1, 1), day = c(1, 2, 3,
4, 5, 6)), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
df_example %>%
pivot_longer(x1:x5) %>%
mutate(greater_120 = value > 120) %>%
group_by(year(date)) %>%
summarise(sum_120 = sum(greater_120,na.rm = TRUE))

Transpose dplyr::tbl object

I am using src_postgres to connect and the dplyr::tbl function to fetch data from a Redshift database. I have applied some filters and a top function to it using dplyr itself. Now my data looks as below:
riid day hour
<dbl> <chr> <chr>
1 5542. "THURSDAY " 12
2 5862. "FRIDAY " 15
3 5982. "TUESDAY " 15
4 6022. WEDNESDAY 16
My final output should be as below:
riid MON TUES WED THUR FRI SAT SUN
5542 12
5862 15
5988 15
6022 16
I have tried spread. It throws the below error because of the class type:
Error in UseMethod("spread_") : no applicable method for 'spread_'
applied to an object of class "c('tbl_dbi', 'tbl_sql', 'tbl_lazy',
'tbl')"
Since this is a really big table, I do not want to collect it into a data frame, as that takes much longer.
I was able to do it as below:
df_mon <- df2 %>% filter(day == 'MONDAY') %>% mutate(MONDAY = hour) %>% select(riid,MONDAY)
df_tue <- df2 %>% filter(day == 'TUESDAY') %>% mutate(TUESDAY = hour) %>% select(riid,TUESDAY)
df_wed <- df2 %>% filter(day == 'WEDNESDAY') %>% mutate(WEDNESDAY = hour) %>% select(riid,WEDNESDAY)
df_thu <- df2 %>% filter(day == 'THURSDAY') %>% mutate(THURSDAY = hour) %>% select(riid,THURSDAY)
df_fri <- df2 %>% filter(day == 'FRIDAY') %>% mutate(FRIDAY = hour) %>% select(riid,FRIDAY)
Is it possible to write all of the above in one statement?
Any help to transpose this in a faster manner is really appreciated.
EDIT
Adding the dput of the tbl object:
structure(list(src = structure(list(con = <S4 object of class structure("PostgreSQLConnection", package = "RPostgreSQL")>,
disco = <environment>), .Names = c("con", "disco"), class = c("src_dbi",
"src_sql", "src")), ops = structure(list(name = "select", x = structure(list(
name = "filter", x = structure(list(name = "filter", x = structure(list(
name = "group_by", x = structure(list(x = structure("SELECT riid,day,hour,sum(weightage) AS score FROM\n (SELECT riid,day,hour,\n POWER(2,(cast(datediff (seconds,convert_timezone('UTC','PKT',SYSDATE),TO_DATE(TO_CHAR(event_captured_dt,'mm/dd/yyyy hh24:mi:ss'),'mm/dd/yyyy hh24:mi:ss')) as decimal) / cast(7862400 as decimal))) AS weightage\n FROM (\n SELECT riid,convert_timezone('GMT','PKT',event_captured_dt) AS EVENT_CAPTURED_DT,\n TO_CHAR(convert_timezone('GMT','PKT',event_captured_dt),'DAY') AS day,\n TO_CHAR(convert_timezone('GMT','PKT',event_captured_dt),'HH24') AS hour\n FROM Zameen_STO_DATA WHERE EVENT_CAPTURED_DT >= TO_DATE((sysdate -30),'yyyy-mm-dd') and LIST_ID = 4282\n )) group by riid,day,hour", class = c("sql",
"character")), vars = c("riid", "day", "hour", "score"
)), .Names = c("x", "vars"), class = c("op_base_remote",
"op_base", "op")), dots = structure(list(riid = riid,
day = day), .Names = c("riid", "day")), args = structure(list(
add = FALSE), .Names = "add")), .Names = c("name",
"x", "dots", "args"), class = c("op_group_by", "op_single",
"op")), dots = structure(list(~min_rank(desc(~score)) <=
1), .Names = ""), args = list()), .Names = c("name",
"x", "dots", "args"), class = c("op_filter", "op_single",
"op")), dots = structure(list(~row_number() == 1), .Names = ""),
args = list()), .Names = c("name", "x", "dots", "args"), class = c("op_filter",
"op_single", "op")), dots = structure(list(~riid, ~day, ~hour), class = "quosures", .Names = c("",
"", "")), args = list()), .Names = c("name", "x", "dots", "args"
), class = c("op_select", "op_single", "op"))), .Names = c("src",
"ops"), class = c("tbl_dbi", "tbl_sql", "tbl_lazy", "tbl"))
I think what you're looking for is the ability to run the tidyr::spread() function against a remote source, or database. I have a PR for dbplyr that attempts to implement that here: https://github.com/tidyverse/dbplyr/pull/72. You can try it out by using: devtools::install_github("tidyverse/dbplyr", ref = devtools::github_pull(72)).
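Assuming that branch adds a spread() method for lazy tables, the call itself should then be the usual tidyr syntax, something like this sketch (untested against a real Redshift connection):
library(dplyr)
library(tidyr)
# df2 is the lazy tbl from the question; pivot day into columns, hour as values
df2 %>%
  spread(key = day, value = hour)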
Use dcast from the reshape2 package:
> data
# A tibble: 4 x 3
riid day hour
<dbl> <chr> <dbl>
1 1.00 TH 12.0
2 2.00 FR 15.0
3 3.00 TU 15.0
4 4.00 WE 16.0
> dcast(data, riid~day, value.var = "hour")
riid FR TH TU WE
1 1 NA 12 NA NA
2 2 15 NA NA NA
3 3 NA NA 15 NA
4 4 NA NA NA 16
Further if you want to remove NA, then
> z <- dcast(data, riid~day, value.var = "hour")
> z[is.na(z)] <- ""
> z
riid FR TH TU WE
1 1 12
2 2 15
3 3 15
4 4 16
I tried to combine your multiple-line attempt into one statement. Can you try this and let us know the outcome?
library(dplyr)
df %>%
rowwise() %>%
mutate(Mon = ifelse(day=='MONDAY', hour[day=='MONDAY'], NA),
Tue = ifelse(day=='TUESDAY', hour[day=='TUESDAY'], NA),
Wed = ifelse(day=='WEDNESDAY', hour[day=='WEDNESDAY'], NA),
Thu = ifelse(day=='THURSDAY', hour[day=='THURSDAY'], NA),
Fri = ifelse(day=='FRIDAY', hour[day=='FRIDAY'], NA),
Sat = ifelse(day=='SATURDAY', hour[day=='SATURDAY'], NA),
Sun = ifelse(day=='SUNDAY', hour[day=='SUNDAY'], NA)) %>%
select(-day, -hour)
Output is:
riid Mon Tue Wed Thu Fri Sat Sun
1 5542 NA NA NA 12 NA NA NA
2 5862 NA NA NA NA 15 NA NA
3 5982 NA 15 NA NA NA NA NA
4 6022 NA NA 16 NA NA NA NA
Sample data:
# A tibble: 4 x 3
riid day hour
* <dbl> <chr> <int>
1 5542 THURSDAY 12
2 5862 FRIDAY 15
3 5982 TUESDAY 15
4 6022 WEDNESDAY 16
Update:
Can you try the approach below using data.table?
library(data.table)
dt <- setDT(df)[, c("Mon","Tue","Wed","Thu","Fri","Sat","Sun") :=
list(ifelse(day=='MONDAY', hour[day=='MONDAY'], NA),
ifelse(day=='TUESDAY', hour[day=='TUESDAY'], NA),
ifelse(day=='WEDNESDAY', hour[day=='WEDNESDAY'], NA),
ifelse(day=='THURSDAY', hour[day=='THURSDAY'], NA),
ifelse(day=='FRIDAY', hour[day=='FRIDAY'], NA),
ifelse(day=='SATURDAY', hour[day=='SATURDAY'], NA),
ifelse(day=='SUNDAY', hour[day=='SUNDAY'], NA))][, !c("day","hour"), with=F]

Return value for minimum in group of columns [duplicate]

This question already has answers here:
Getting the minimum of the rows in a data frame
(3 answers)
Closed 6 years ago.
I have a group of columns, and I need a new column min123 with the minimum value of these columns 123a_1 through 123a_5.
dff <- structure(list(`MCI ID` = c("070405344", "230349820", "260386435","370390587", "380406805", "391169282", "440377986", "750391394","890373764", "910367024"),
`123a_1` = structure(c(16672, 16372,16730, 16688, 16700, 16783, 16709, 17033, 16786, 16675), class = "Date"),
`123a_2` = structure(c(17029, 16422, 17088, 17036, 17057,17140, 17072, 17043, 17141, 17038), class = "Date"),
`123a_3` = structure(c(NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_), class = "Date"),
`123a_4` = structure(c(NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_), class = "Date"),
`123a_5` = structure(c(NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_), class = "Date")),
.Names = c("MCI ID","123a_1", "123a_2", "123a_3", "123a_4", "123a_5"), row.names = c(NA,10L), class = "data.frame")
A base R approach using do.call and pmin:
dff$min123 <- do.call(pmin, c(dff[-1], na.rm = TRUE))
A similar approach in dplyr:
library(dplyr)
dff %>%
mutate(min123 = do.call(pmin, c(select(., -1), na.rm = TRUE)))
Or data.table:
library(data.table)
setDT(dff)[, min123 := do.call(pmin, c(.SD, na.rm = TRUE)), .SDcols = -1]
library(dplyr)
dff %>%
mutate(min123 = pmin(`123a_1`, `123a_2`, `123a_3`, `123a_4`, `123a_5`, na.rm = T))
That's what the function pmin is for:
> str(dff)
'data.frame': 10 obs. of 6 variables:
$ MCI ID: chr "070405344" "230349820" "260386435" "370390587" ...
$ 123a_1: Date, format: "2015-08-25" "2014-10-29" "2015-10-22" ...
$ 123a_2: Date, format: "2016-08-16" "2014-12-18" "2016-10-14" ...
$ 123a_3: Date, format: NA NA NA ...
$ 123a_4: Date, format: NA NA NA ...
$ 123a_5: Date, format: NA NA NA ...
> dff$groupmin <- pmin(dff[[2]],dff[[3]],dff[[4]], dff[[5]], dff[[6]], na.rm=TRUE)
> head(dff)
MCI ID 123a_1 123a_2 123a_3 123a_4 123a_5 groupmin
1 070405344 2015-08-25 2016-08-16 <NA> <NA> <NA> 2015-08-25
2 230349820 2014-10-29 2014-12-18 <NA> <NA> <NA> 2014-10-29
3 260386435 2015-10-22 2016-10-14 <NA> <NA> <NA> 2015-10-22
4 370390587 2015-09-10 2016-08-23 <NA> <NA> <NA> 2015-09-10
5 380406805 2015-09-22 2016-09-13 <NA> <NA> <NA> 2015-09-22
6 391169282 2015-12-14 2016-12-05 <NA> <NA> <NA> 2015-12-14
