This is a tweak to my previously posted question. Here's my data:
set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05', '2016-01-07', '2016-01-10', '2016-01-14', '2016-01-16'), 2)),
value = round(rnorm(14, 15, 5), 1))
user_id date value
27 2016-01-01 15.0
27 2016-01-03 22.4
27 2016-01-05 13.3
27 2016-01-07 21.9
27 2016-01-10 20.6
27 2016-01-14 18.6
27 2016-01-16 16.4
11 2016-01-01 6.8
11 2016-01-03 21.3
11 2016-01-05 19.8
11 2016-01-07 22.0
11 2016-01-10 19.4
11 2016-01-14 17.5
11 2016-01-16 19.3
This time, I'd like to calculate the cumulative sum of value for each user_id over a specified time period, e.g. the last 7 and 14 days. The desired solution would look like this:
user_id date value v_minus7 v_minus14
27 2016-01-01 15.0 15.0 15.0
27 2016-01-03 22.4 37.4 37.4
27 2016-01-05 13.3 50.7 50.7
27 2016-01-07 21.9 72.6 72.6
27 2016-01-10 20.6 78.2 93.2
27 2016-01-14 18.6 61.1 111.8
27 2016-01-16 16.4 55.6 113.2
11 2016-01-01 6.8 6.8 6.8
11 2016-01-03 21.3 28.1 28.1
11 2016-01-05 19.8 47.9 47.9
11 2016-01-07 22.0 69.9 69.9
11 2016-01-10 19.4 82.5 89.3
11 2016-01-14 17.5 58.9 106.8
11 2016-01-16 19.3 56.2 119.3
Ideally, I'd like to use dplyr for this, but other packages would be fine.
Logic: first group by user_id and date. Then, for each subset of the data, check which dates lie between the current date and 7/14 days back using between(), which returns a logical vector. Based on this logical vector, sum the value column:
library(data.table)
setDT(DF2)[, `:=`(
  v_minus7  = sum(DF2$value[DF2$user_id == user_id][between(DF2$date[DF2$user_id == user_id], date - 7, date, incbounds = TRUE)]),
  v_minus14 = sum(DF2$value[DF2$user_id == user_id][between(DF2$date[DF2$user_id == user_id], date - 14, date, incbounds = TRUE)])
), by = c("user_id", "date")][]
# user_id date value v_minus7 v_minus14
#1: 27 2016-01-01 15.0 15.0 15.0
#2: 27 2016-01-03 22.4 37.4 37.4
#3: 27 2016-01-05 13.3 50.7 50.7
#4: 27 2016-01-07 21.9 72.6 72.6
#5: 27 2016-01-10 20.6 78.2 93.2
#6: 27 2016-01-14 18.6 61.1 111.8
#7: 27 2016-01-16 16.4 55.6 113.2
#8: 11 2016-01-01 6.8 6.8 6.8
#9: 11 2016-01-03 21.3 28.1 28.1
#10: 11 2016-01-05 19.8 47.9 47.9
#11: 11 2016-01-07 22.0 69.9 69.9
#12: 11 2016-01-10 19.4 82.5 89.3
#13: 11 2016-01-14 17.5 58.9 106.8
#14: 11 2016-01-16 19.3 56.2 119.3
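A note on the above: the repeated DF2$... subsetting recomputes the whole per-user slice for every (user_id, date) group. A leaner sketch with the same inclusive [date - 7, date] window, computed once per user_id group (my variant, not from the original answer):
setDT(DF2)[, `:=`(
  v_minus7  = sapply(seq_along(date), function(i)
                sum(value[date >= date[i] - 7  & date <= date[i]])),
  v_minus14 = sapply(seq_along(date), function(i)
                sum(value[date >= date[i] - 14 & date <= date[i]]))
), by = user_id][]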
# from alexis_laz's answer below (left.open = TRUE makes the subtracted
# cumsum stop just before date - minus, so the window is the inclusive
# [date - minus, date])
ff = function(date, value, minus){
  cs = cumsum(value)
  i = findInterval(date - minus, date, left.open = TRUE)
  w = which(as.logical(i))
  i[w] = cs[i[w]]
  cs - i
}
setDT(DF2)
DF2[, `:=`(v_minus7  = ff(date, value, 7),
           v_minus14 = ff(date, value, 14)), by = c("user_id")]
You can use rollapply from zoo once you fill in the missing dates: on a daily grid, a fixed row width corresponds to a fixed day window, so width = 8 covers the current day plus the 7 preceding days, and width = 15 the current day plus the 14 preceding days:
library(dplyr)
library(zoo)
set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05', '2016-01-07', '2016-01-10', '2016-01-14', '2016-01-16'), 2)),
value = round(rnorm(14, 15, 5), 1))
all_combinations <- expand.grid(user_id = unique(DF2$user_id),
                                date = seq(min(DF2$date), max(DF2$date), by = "day"))
res <- DF2 %>%
  merge(all_combinations, by = c('user_id', 'date'), all = TRUE) %>%
  group_by(user_id) %>%
  arrange(date) %>%
  mutate(v_minus7  = rollapply(value, width = 8,  FUN = function(x) sum(x, na.rm = TRUE), partial = TRUE, align = 'right'),
         v_minus14 = rollapply(value, width = 15, FUN = function(x) sum(x, na.rm = TRUE), partial = TRUE, align = 'right')) %>%
  filter(!is.na(value))
Here is another idea with findInterval to minimize comparisons and operations. First, define a function to handle the basic part, ignoring the grouping. The following function computes the cumulative sum and subtracts, at each position, the cumulative sum at its respective past date:
ff = function(date, value, minus)
{
  cs = cumsum(value)                                       # running totals
  i = findInterval(date - minus, date, left.open = TRUE)   # last position before the window
  w = which(as.logical(i))
  i[w] = cs[i[w]]                                          # cumsum just before the window start
  cs - i                                                   # window sum = total so far minus pre-window total
}
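To see what findInterval contributes, here is a quick trace for user 27, using the dates and values from the question (minus = 7):
d  <- as.Date(c('2016-01-01','2016-01-03','2016-01-05','2016-01-07',
                '2016-01-10','2016-01-14','2016-01-16'))
v  <- c(15.0, 22.4, 13.3, 21.9, 20.6, 18.6, 16.4)
cs <- cumsum(v)
# 15.0 37.4 50.7 72.6 93.2 111.8 128.2
findInterval(d - 7, d, left.open = TRUE)
# 0 0 0 0 1 3 4   <- index of the cumsum to subtract at each position
# e.g. for 2016-01-10: 93.2 - cs[1] = 93.2 - 15.0 = 78.2, matching v_minus7 above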
And apply it by group:
do.call(rbind,
        lapply(split(DF2, DF2$user_id),
               function(x) data.frame(x,
                                      minus7  = ff(x$date, x$value, 7),
                                      minus14 = ff(x$date, x$value, 14))))
# user_id date value minus7 minus14
#11.8 11 2016-01-01 6.8 6.8 6.8
#11.9 11 2016-01-03 21.3 28.1 28.1
#11.10 11 2016-01-05 19.8 47.9 47.9
#11.11 11 2016-01-07 22.0 69.9 69.9
#11.12 11 2016-01-10 19.4 82.5 89.3
#11.13 11 2016-01-14 17.5 58.9 106.8
#11.14 11 2016-01-16 19.3 56.2 119.3
#27.1 27 2016-01-01 15.0 15.0 15.0
#27.2 27 2016-01-03 22.4 37.4 37.4
#27.3 27 2016-01-05 13.3 50.7 50.7
#27.4 27 2016-01-07 21.9 72.6 72.6
#27.5 27 2016-01-10 20.6 78.2 93.2
#27.6 27 2016-01-14 18.6 61.1 111.8
#27.7 27 2016-01-16 16.4 55.6 113.2
The above apply-by-group operation can, of course, be replaced by whatever grouping method you prefer.
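For example, since the question asked for dplyr, the same ff can be dropped into a grouped mutate (a sketch, not from the original answer):
library(dplyr)
DF2 %>%
  group_by(user_id) %>%
  mutate(v_minus7  = ff(date, value, 7),
         v_minus14 = ff(date, value, 14)) %>%
  ungroup()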
Here are some approaches using zoo.
1) Define a function sum_last that, given a zoo object, sums the values whose times lie within k days of the last time in the series, and define a function roll that applies it to an entire series. Then use ave to apply roll to each user_id, once with k = 7 and once with k = 14.
Note that this makes use of the coredata argument of rollapply, which was introduced in the most recent version of zoo, so be sure you don't have an earlier version.
library(zoo)
# compute the sum of values within k time units of the last time point
sum_last <- function(z, k) {
  tt <- time(z)
  sum(z[tt > tail(tt, 1) - k])
}
# given indexes ix, run rollapplyr on read.zoo(DF2[ix, -1]);
# the positional k is the rolling width in points, while the named k = k
# is passed through ... to sum_last as the day window
roll <- function(ix, k) {
  rollapplyr(read.zoo(DF2[ix, -1]), k, sum_last, coredata = FALSE, partial = TRUE, k = k)
}
nr <- nrow(DF2)
transform(DF2,
          v_minus7  = ave(1:nr, user_id, FUN = function(x) roll(x, 7)),
          v_minus14 = ave(1:nr, user_id, FUN = function(x) roll(x, 14)))
2) An alternative is to replace roll with the version shown below. This converts DF2[ix, -1] to "zoo" and merges it with a zero-width daily grid, filling the gaps with zeros. rollapply is then applied to that, and window subsets the result back to the original times.
roll <- function(ix, k) {
  z <- read.zoo(DF2[ix, -1])
  g <- zoo(, seq(start(z), end(z), "day"))   # zero-width daily grid
  m <- merge(z, g, fill = 0)
  r <- rollapplyr(m, k, sum, partial = TRUE)
  window(r, time(z))
}
Try the runner package if you want to calculate over time/date windows. See the GitHub documentation, in particular the "Windows depending on date" section.
library(runner)
DF2 %>%
  group_by(user_id) %>%
  mutate(
    v_minus7 = sum_run(value, 7, idx = date),
    v_minus14 = sum_run(value, 14, idx = date)
  )
Here is a benchmark of the approaches above:
library(data.table)
library(dplyr)
library(zoo)
library(tbrf)
set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05', '2016-01-07', '2016-01-10', '2016-01-14', '2016-01-16'), 2)),
value = round(rnorm(14, 15, 5), 1))
# example 1
data_table <- function(DF2) {
  setDT(DF2)[, `:=`(
    v_minus7  = sum(DF2$value[DF2$user_id == user_id][data.table::between(DF2$date[DF2$user_id == user_id], date - 7, date, incbounds = TRUE)]),
    v_minus14 = sum(DF2$value[DF2$user_id == user_id][data.table::between(DF2$date[DF2$user_id == user_id], date - 14, date, incbounds = TRUE)])
  ), by = c("user_id", "date")][]
}
# example 2
dplyr_grid <- function(DF2) {
  all_combinations <- expand.grid(user_id = unique(DF2$user_id),
                                  date = seq(min(DF2$date), max(DF2$date), by = "day"))
  DF2 %>%
    merge(all_combinations, by = c('user_id', 'date'), all = TRUE) %>%
    group_by(user_id) %>%
    arrange(date) %>%
    mutate(v_minus7  = rollapply(value, width = 8,  FUN = function(x) sum(x, na.rm = TRUE), partial = TRUE, align = 'right'),
           v_minus14 = rollapply(value, width = 15, FUN = function(x) sum(x, na.rm = TRUE), partial = TRUE, align = 'right')) %>%
    filter(!is.na(value))
}
# example 3
dplyr_tbrf <- function(DF2) {
  DF2 %>%
    group_by(user_id) %>%
    tbrf::tbr_sum(value, date, unit = "days", n = 7) %>%
    arrange(user_id, date) %>%
    rename(v_minus7 = sum) %>%
    tbrf::tbr_sum(value, date, unit = "days", n = 14) %>%
    rename(v_minus14 = sum)
}
# example 4
runner <- function(DF2) {
  DF2 %>%
    group_by(user_id) %>%
    mutate(
      v_minus7 = sum_run(value, 7, idx = date),
      v_minus14 = sum_run(value, 14, idx = date)
    )
}
microbenchmark::microbenchmark(
  runner = runner(DF2),
  data.table = data_table(DF2),
  dplyr = dplyr_grid(DF2),
  dplyr_tbrf = dplyr_tbrf(DF2),
  times = 100L
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# runner 1.478331 1.797512 2.350416 2.083680 2.559875 9.181675 100
# data.table 5.432618 5.970619 7.107540 6.424862 7.563405 13.674661 100
# dplyr 63.841710 73.652023 86.228112 79.861760 92.304231 256.841078 100
# dplyr_tbrf 60.582381 72.511075 90.175891 80.435700 92.865997 307.454643 100
Here is a new option using dplyr and tbrf
library(tbrf)
library(dplyr)
set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05', '2016-01-07', '2016-01-10', '2016-01-14', '2016-01-16'), 2)),
value = round(rnorm(14, 15, 5), 1))
DF2 %>%
  group_by(user_id) %>%
  tbrf::tbr_sum(value, date, unit = "days", n = 7) %>%
  arrange(user_id, date) %>%
  rename(v_minus7 = sum) %>%
  tbrf::tbr_sum(value, date, unit = "days", n = 14) %>%
  rename(v_minus14 = sum)
Creates a tibble:
# A tibble: 14 x 5
user_id date value v_minus7 v_minus14
<dbl> <date> <dbl> <dbl> <dbl>
1 11 2016-01-01 6.8 6.8 21.8
2 27 2016-01-01 15 15 21.8
3 11 2016-01-03 21.3 28.1 65.5
4 27 2016-01-03 22.4 37.4 65.5
5 11 2016-01-05 19.8 47.9 98.6
6 27 2016-01-05 13.3 50.7 98.6
7 11 2016-01-07 22 69.9 142.
8 27 2016-01-07 21.9 72.6 142.
9 11 2016-01-10 19.4 82.5 182.
10 27 2016-01-10 20.6 78.2 182.
11 11 2016-01-14 17.5 58.9 219.
12 27 2016-01-14 18.6 61.1 219.
13 11 2016-01-16 19.3 56.2 232.
14 27 2016-01-16 16.4 55.6 232.
I suspect this isn't the fastest solution with larger datasets, but it works well in dplyr chains.
Related
Year A B C D E F
1993-Q1 15.3 5.77 437.02 487.68 97 86.9
1993-Q2 13.5 5.74 455.2 504.5 94.7 85.4
1993-Q3 12.9 5.79 469.42 523.37 92.4 82.9
:::
2021-Q1 18.3 6.48 35680.82 29495.92 182.2 220.4
2021-Q2 7.9 6.46 36940.3 30562.03 180.4 218
Dataset1 <- read.csv('C:/Users/s/Desktop/R/intro/data/Dataset1.csv')
class(Dataset1)
[1] "data.frame"
time_series <- ts(Dataset1, start=1993, frequency = 4)
class(time_series)
[1] "mts" "ts" "matrix"
I don't know how to proceed from there to read my Year column as quarterly dates instead of numbers!
Date class does not work well with ts class; it is better to use year and quarter. Using the input shown reproducibly in the Note at the end, read the file with read.csv.zoo and the yearqtr class, and then convert it to ts. The strip.white is probably not needed, but we added it just in case.
library(zoo)
z <- read.csv.zoo("Dataset1.csv", FUN = as.yearqtr, format = "%Y-Q%q",
strip.white = TRUE)
tt <- as.ts(z)
tt
## A B C D E F
## 1993 Q1 15.3 5.77 437.02 487.68 97.0 86.9
## 1993 Q2 13.5 5.74 455.20 504.50 94.7 85.4
## 1993 Q3 12.9 5.79 469.42 523.37 92.4 82.9
class(tt)
## [1] "mts" "ts" "matrix"
as.integer(time(tt)) # years
## [1] 1993 1993 1993
cycle(tt) # quarters
## Qtr1 Qtr2 Qtr3
## 1993 1 2 3
as.numeric(time(tt)) # time in years
## [1] 1993.00 1993.25 1993.50
If you did want to use Date class it would be better to use a zoo (or xts) series.
zd <- aggregate(z, as.Date, c)
zd
## A B C D E F
## 1993-01-01 15.3 5.77 437.02 487.68 97.0 86.9
## 1993-04-01 13.5 5.74 455.20 504.50 94.7 85.4
## 1993-07-01 12.9 5.79 469.42 523.37 92.4 82.9
If you want a data frame or xts object then fortify.zoo(z), fortify.zoo(zd), as.xts(z) or as.xts(zd) can be used depending on which one you want.
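For example, using only the objects built above (as.xts needs the xts package loaded):
library(xts)
fortify.zoo(z)    # data.frame with an Index column of class yearqtr
as.xts(zd)        # xts series indexed by Date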
Note
Lines <- "Year,A,B,C,D,E,F
1993-Q1,15.3,5.77,437.02,487.68,97,86.9
1993-Q2,13.5,5.74,455.2,504.5,94.7,85.4
1993-Q3,12.9,5.79,469.42,523.37,92.4,82.9
"
cat(Lines, file = "Dataset1.csv")
lubridate has a really nice year-quarter function, yq, to convert year-quarters to dates.
Dataset1<-structure(list(Year = c("1993-Q1", "1993-Q2", "1993-Q3", "1993-Q4", "1994-Q1", "1994-Q2"), ChinaGDP = c(15.3, 13.5, 12.9, 14.1, 14.1, 13.3), Yuan = c(5.77, 5.74, 5.79, 5.81, 8.72, 8.7), totalcredit = c(437.02, 455.2, 469.42, 521.68, 363.42, 389.01), bankcredit = c(487.68, 504.5, 523.37, 581.83, 403.48, 431.06), creditpercGDP = c(97, 94.7, 92.4, 95.6, 91.9, 90), creditGDPratio = c(86.9, 85.4, 82.9, 85.7, 82.8, 81.2)), row.names = c(NA, 6L), class = "data.frame")
library(lubridate)
library(dplyr)
df_quarter <- Dataset1 %>%
mutate(date=yq(Year)) %>%
relocate(date, .after=Year)
df_quarter
#> Year date ChinaGDP Yuan totalcredit bankcredit creditpercGDP
#> 1 1993-Q1 1993-01-01 15.3 5.77 437.02 487.68 97.0
#> 2 1993-Q2 1993-04-01 13.5 5.74 455.20 504.50 94.7
#> 3 1993-Q3 1993-07-01 12.9 5.79 469.42 523.37 92.4
#> 4 1993-Q4 1993-10-01 14.1 5.81 521.68 581.83 95.6
#> 5 1994-Q1 1994-01-01 14.1 8.72 363.42 403.48 91.9
#> 6 1994-Q2 1994-04-01 13.3 8.70 389.01 431.06 90.0
#> creditGDPratio
#> 1 86.9
#> 2 85.4
#> 3 82.9
#> 4 85.7
#> 5 82.8
#> 6 81.2
Created on 2022-01-15 by the reprex package (v2.0.1)
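If a time series object is needed afterwards, the dates produced by yq drop straight into a zoo index; a minimal sketch building on df_quarter above:
library(zoo)
z <- zoo(df_quarter[, -(1:2)], order.by = df_quarter$date)  # drop Year and date columns
head(z, 3)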
I have the following dataset of weather conditions at 5 different sites, observed at 15-minute intervals over a year, and am developing a shiny app based on it.
site_id date_time latitude longitude ambient_air_tem~ relative_humidy barometric_pres~ average_wind_sp~ particulate_den~
<chr> <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 arc1046 2019-11-15 09:15:00 -37.8 145. 14.4 65.4 1007. 7.45 3.9
2 arc1048 2019-11-15 09:15:00 -37.8 145. 14.0 65.5 1006. 6.95 4.4
3 arc1045 2019-11-15 09:15:00 -37.8 145. 14.8 60 1007. 4.93 3.9
4 arc1047 2019-11-15 09:15:00 -37.8 145. 14.4 66.1 1008. 7.85 4.5
5 arc1050 2019-11-15 09:15:00 -37.8 145. 14.1 64.7 1007. 5.8 3.9
6 arc1045 2019-11-15 09:30:00 -37.8 145. 15.4 57.1 1007. 4.43 3.8
7 arc1046 2019-11-15 09:30:00 -37.8 145. 14.8 63.2 1007. 7.6 4.5
8 arc1047 2019-11-15 09:30:00 -37.8 145. 15.2 62.7 1008 7.13 3.6
9 arc1048 2019-11-15 09:30:00 -37.8 145. 14.6 62.2 1007. 7.09 4.7
10 arc1050 2019-11-15 09:30:00 -37.8 145. 14.6 62.5 1007 5.94 3.5
I mapped the 5 sites using leaflet.
leaflet(quarter_hour_readings) %>%
  addTiles() %>%
  addCircleMarkers(
    layerId = ~site_id,
    label = ~site_id)
And I now want to include radar (spider) plots on each of the markers on the map upon selecting a single date. For now, I have filtered the data to a single timestamp for the following radar plot.
library(dplyr)
library(tibble)  # column_to_rownames
library(fmsb)
# fmsb::radarchart expects the first row to hold the axis maxima and the
# second row the axis minima, followed by the data rows
dat <- rbind(c(85.00, 100.00, 2000.00, 160.00, 999.9, 1999.9),
             c(-40.00, 0.00, 10.00, 0.00, 0.00, 0.00),
             quarter_hour_readings %>%
               filter(date_time == as.POSIXct("2019-11-15 09:15:00", tz = "UTC")) %>%
               column_to_rownames(var = "site_id") %>%
               select(c("ambient_air_temperature", "relative_humidy", "barometric_pressure",
                        "average_wind_speed", "particulate_density_2.5", "particulate_density_10")))
radarchart(dat)
I am, however, unsure how to include these radar plots on the respective markers on the map, and whether there is an easier way to handle this. I found this package for inserting minicharts on leaflet maps, but I wasn't able to find how to add radar plots to a map.
Note: since you did not provide a reproducible dataset, I use some fake data.
You can follow the approach described here:
library(leaflet)
library(shiny)   # plotPNG
library(fmsb)    # radarchart
m <- leaflet() %>% addTiles()
rand_lng <- function(n = 5) rnorm(n, -93.65, .01)
rand_lat <- function(n = 5) rnorm(n, 42.0285, .01)
rdr_dat <- structure(list(total = c(5, 1, 2.15031008049846, 4.15322054177523, 2.6359076872468),
                          phys = c(15, 3, 12.3804132539814, 6.6208886719424, 12.4789917719968),
                          psycho = c(3, 0, 0.5, NA, 3),
                          social = c(5, 1, 2.82645894121379, 4.82733338139951, 2.81333662476391),
                          env = c(5, 1, 5, 2.5, 4)),
                     row.names = c(NA, -5L), class = "data.frame")
# Render a plot expression to a temporary PNG and return it as a base64 data URI
makePlotURI <- function(expr, width, height, ...) {
  pngFile <- plotPNG(function() { expr }, width = width, height = height, ...)
  on.exit(unlink(pngFile))
  base64 <- httpuv::rawToBase64(readBin(pngFile, raw(1), file.size(pngFile)))
  paste0("data:image/png;base64,", base64)
}
set.seed(1)
plots <- data.frame(lat = rand_lat(),
                    lng = rand_lng(),
                    radar = rep(makePlotURI({ radarchart(rdr_dat) }, 200, 200, bg = "white"), 5))
m %>% addMarkers(icon = ~ icons(radar), data = plots)
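As written, one and the same chart is repeated for all five markers (rep(..., 5)). To give each site its own chart, you could render one data URI per marker; a hedged sketch, assuming a hypothetical list site_dat holding one fmsb-ready data frame (max row, min row, site row) per site:
# site_dat is hypothetical: one fmsb-style data frame per marker
plots$radar <- vapply(site_dat,
                      function(d) makePlotURI({ radarchart(d) }, 200, 200, bg = "white"),
                      character(1))
m %>% addMarkers(icon = ~ icons(radar), data = plots)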
I'd like to take advantage of BatchGetSymbols. Any advice on how I can best manipulate the output to get the format below?
symbols_RP <- c('VDNR.L','VEUD.L','VDEM.L','IDTL.L','IEMB.L','GLRE.L','IGLN.L')
#Setting price download date range
from_date <- as.Date('2019-01-01')
to_date <- as.Date(Sys.Date())
get.symbol.adjclose <- function(ticker) {
  l.out <- BatchGetSymbols(ticker, first.date = from_date, last.date = to_date,
                           do.cache = TRUE, freq.data = "daily", do.complete.data = TRUE,
                           do.fill.missing.prices = TRUE, be.quiet = FALSE)
  return(l.out$df.tickers)
}
prices <- get.symbol.adjclose(symbols_RP)
Output Batchgetsymbols
$df.tickers
price.open price.high price.low price.close volume price.adjusted ref.date ticker ret.adjusted.prices ret.closing.prices
1 60.6000 61.7950 60.4000 61.5475 4717 60.59111 2019-01-02 VDNR.L NA NA
2 60.7200 60.9000 60.5500 60.6650 22015 59.72233 2019-01-03 VDNR.L -1.433838e-02 -1.433852e-02
3 60.9050 60.9500 60.9050 61.8875 1010 60.92583 2019-01-04 VDNR.L 2.015164e-02 2.015165e-02
4 62.3450 62.7850 62.3400 62.7300 820 61.75524 2019-01-07 VDNR.L 1.361339e-02 1.361340e-02
Desired output below:
VTI PUTW VEA VWO TLT VNQI GLD EMB UST FTAL
2019-01-02 124.6962 25.18981 35.72355 36.92347 118.6449 48.25209 121.33 97.70655 55.18464 45.76
2019-01-03 121.8065 25.05184 35.43429 36.34457 119.9950 48.32627 122.43 98.12026 56.01122 45.54
2019-01-04 125.8384 25.39677 36.52383 37.49271 118.6061 49.38329 121.44 98.86311 55.10592 46.63
2019-01-07 127.1075 25.57416 36.63954 37.56989 118.2564 49.67072 121.86 99.28625 54.81071 46.54
2019-01-08 128.4157 25.61358 36.89987 37.78215 117.9456 50.06015 121.53 99.21103 54.54502 47.05
2019-01-09 129.0210 25.56431 37.35305 38.33209 117.7610 50.39395 122.31 99.38966 54.56470 47.29
Coming from other languages, I know I could use a for loop, but there are usually faster ways in R.
Could someone point me to the R way?
Improved version:
get.symbol.adjclose <- function(ticker) {
  l.out <- BatchGetSymbols(ticker, first.date = from_date, last.date = to_date,
                           do.cache = TRUE, freq.data = "daily", do.complete.data = TRUE,
                           do.fill.missing.prices = TRUE, be.quiet = FALSE)
  return(as.data.frame(l.out$df.tickers[c("ticker", "ref.date", "price.open", "price.high",
                                          "price.low", "price.close", "volume", "price.adjusted")]))
}
Using dplyr and tidyr: I'm selecting price.adjusted, but you can use any of the price columns you need.
library(dplyr)
library(tidyr)
prices %>%
  select(ref.date, ticker, price.adjusted) %>% # select columns before pivot_wider
  pivot_wider(names_from = ticker, values_from = price.adjusted)
# A tibble: 352 x 7
ref.date GLRE.L IDTL.L IGLN.L VDEM.L VDNR.L VEUD.L
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2019-01-02 NA NA 25.2 51.0 60.6 30.2
2 2019-01-03 32.2 4.50 25.3 50.3 59.7 30.1
3 2019-01-04 32.6 4.47 25.2 51.7 60.9 30.9
4 2019-01-07 32.8 4.47 25.3 51.8 61.8 31.0
5 2019-01-08 32.8 4.44 25.2 51.9 62.0 31.3
6 2019-01-09 33.3 4.43 25.3 53.0 62.7 31.7
7 2019-01-10 33.5 4.41 25.3 53.2 62.7 31.7
8 2019-01-11 33.8 4.40 25.3 53.1 62.8 31.6
9 2019-01-14 33.8 4.41 25.3 52.7 62.7 31.4
10 2019-01-15 34.0 4.41 25.3 53.1 63.1 31.4
# ... with 342 more rows
Note from BatchGetSymbols:
IEMB.L OUT: not enough data (thresh.bad.data = 75%)
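If you need the question's exact target layout (dates as row names, one column per ticker), the wide result converts directly to an xts object; a minimal sketch reusing the pivot above:
library(xts)
wide <- prices %>%
  select(ref.date, ticker, price.adjusted) %>%
  pivot_wider(names_from = ticker, values_from = price.adjusted)
# an xts matrix indexed by date reproduces the desired row-name layout
xts(as.matrix(wide[, -1]), order.by = wide$ref.date)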
I have a data.frame that looks like this
DATE MEAN SUM MAX MIN SAISON JAHR
1 1995-09-01 00:00:00 2.370833 56.9 7.4 0 S 1995
2 1995-09-01 01:00:00 2.225000 53.4 7.4 0 S 1995
3 1995-09-01 02:00:00 2.091667 50.2 7.4 0 S 1995
4 1995-09-01 03:00:00 1.929167 46.3 7.4 0 S 1995
5 1995-09-01 04:00:00 1.745833 41.9 7.4 0 S 1995
6 1995-09-01 05:00:00 1.558333 37.4 7.4 0 S 1995
....
With the dplyr package I am able to extract the highest SUM for every SAISON and JAHR:
gJahrSAISON_24 <- group_by(.data = dataframe, JAHR, SAISON)
summarise(gJahrSAISON_24, hoechsterNiederschlag = max(SUM))
Do you have any idea how to extract the ten(!) highest sums for every JAHR and SAISON?
You can use slice with arrange
library(dplyr)
df1 %>%
group_by(JAHR, SAISON) %>%
arrange(desc(SUM)) %>%
slice(1:10)
Or filter with min_rank/dense_rank on the values ranked in descending order
df1 %>%
  group_by(JAHR, SAISON) %>%
  filter(dense_rank(desc(SUM)) <= 10)
Similar options using data.table are
library(data.table)#v1.9.5+
setDT(df1)[order(-SUM), .SD[1:10], by = .(JAHR, SAISON)]
Or
setDT(df1)[, .SD[frank(-SUM, ties.method = 'first') <= 10], by = .(JAHR, SAISON)]
Or using sqldf
library(sqldf)
sqldf('select * from df1 i
where rowid in
(select rowid from df1
where JAHR = i.JAHR and SAISON=i.SAISON
order by SUM desc
limit 10)
order by i.JAHR, i.SAISON, i.SUM desc')
Or with base R
df1[with(df1, ave(SUM, SAISON, JAHR, FUN=function(x)
rank(-x, ties.method='first'))<=10),]
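On dplyr >= 1.0.0 there is also slice_max, which states the intent directly; a sketch (with_ties = FALSE keeps exactly ten rows per group):
df1 %>%
  group_by(JAHR, SAISON) %>%
  slice_max(SUM, n = 10, with_ties = FALSE) %>%
  ungroup()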
In R I have a data.frame in which several variables have been measured monthly over several years. I would like to derive the monthly average (using all years) for each variable. Ideally these new variables would all sit together in a new data.frame (carrying over the ID); below I simply add each new variable to the existing data.frame. The only way I know how to do this at the moment (below) is quite laborious, and I was hoping there is a smarter way in R that does not require typing out each month and variable as I did below.
# Example data.frame with only two years, two month, and two variables
# In the real data set there are always 12 months per year
# and there are at least four variables
df<- structure(list(ID = 1:4, ABC.M1Y2001 = c(10, 12.3, 45, 89), ABC.M2Y2001 = c(11.1,
34, 67.7, -15.6), ABC.M1Y2002 = c(-11.1, 9, 34, 56.5), ABC.M2Y2002 = c(12L,
13L, 11L, 21L), DEF.M1Y2001 = c(14L, 14L, 14L, 16L), DEF.M2Y2001 = c(15L,
15L, 15L, 12L), DEF.M1Y2002 = c(5, 12, 23.5, 34), DEF.M2Y2002 = c(6L,
34L, 61L, 56L)), .Names = c("ID", "ABC.M1Y2001", "ABC.M2Y2001","ABC.M1Y2002",
"ABC.M2Y2002", "DEF.M1Y2001", "DEF.M2Y2001", "DEF.M1Y2002",
"DEF.M2Y2002"), class = "data.frame", row.names = c(NA, -4L))
# list variable to average for ABC Month 1 across years
ABC.M1.names <- c("ABC.M1Y2001", "ABC.M1Y2002")
df <- transform(df, ABC.M1 = rowMeans(df[,ABC.M1.names], na.rm = TRUE))
# list variable to average for ABC Month 2 across years
ABC.M2.names <- c("ABC.M2Y2001", "ABC.M2Y2002")
df <- transform(df, ABC.M2 = rowMeans(df[,ABC.M2.names], na.rm = TRUE))
# and so forth for ABC
# ...
# list variables to average for DEF Month 1 across years
DEF.M1.names <- c("DEF.M1Y2001", "DEF.M1Y2002")
df <- transform(df, DEF.M1 = rowMeans(df[,DEF.M1.names], na.rm = TRUE))
# and so forth for DEF
# ...
Here's a solution using data.table development version v1.8.11 (which has melt and cast methods implemented for data.table):
require(data.table)
require(reshape2) # melt/cast builds on S3 generic from reshape2
dt <- data.table(df) # where df is your data.frame
# the grouping expression gets the default column name "gsub",
# hence the formula ID ~ gsub
dcast.data.table(melt(dt, id = "ID")[, sum(value)/.N,
                 by = list(ID, gsub("Y.*$", "", variable))],
                 ID ~ gsub)
ID ABC.M1 ABC.M2 DEF.M1 DEF.M2
1: 1 -0.55 11.55 9.50 10.5
2: 2 10.65 23.50 13.00 24.5
3: 3 39.50 39.35 18.75 38.0
4: 4 72.75 2.70 25.00 34.0
You can just cbind this to your original data.
Note that sum is a primitive whereas mean is an S3 generic. Therefore, using sum(.)/length(.) is better: if there are many groupings, dispatching the right method of mean for every group can be quite time-consuming. .N is a special variable in data.table that directly gives you the length of the group.
Here is a solution using reshape2 that is more automated when you have lots of data and uses regular expressions to extract the variable name and the month. This solution will give you a nice summary table.
# Load required package
require(reshape2)
# Melt your wide data into long format
mdf <- melt(df, id = "ID")
# Extract the month and variable name from the variable column
mdf$Month <- gsub("^.*\\.(M[0-9]{1,2}).*$", "\\1", mdf$variable)
mdf$Var <- gsub("^(.*)\\..*", "\\1", mdf$variable)
# Aggregate by month and variable
dcast(mdf, Var ~ Month, mean)
# Var M1 M2
#1 ABC 30.5875 19.275
#2 DEF 16.5625 26.750
Or to be compatible with the other solutions, and return the table by ID as well...
dcast(mdf, ID ~ Var + Month, mean)
# ID ABC_M1 ABC_M2 DEF_M1 DEF_M2
#1 1 -0.55 11.55 9.50 10.5
#2 2 10.65 23.50 13.00 24.5
#3 3 39.50 39.35 18.75 38.0
#4 4 72.75 2.70 25.00 34.0
This is pretty straightforward in base R.
mean.names <- split(names(df)[-1], gsub('Y[0-9]{4}$', '', names(df)[-1]))
means <- lapply(mean.names, function(x) rowMeans(df[, x], na.rm = TRUE))
data.frame(df, means)
This gives you your original data.frame with the following four columns at the end:
ABC.M1 ABC.M2 DEF.M1 DEF.M2
1 -0.55 11.55 9.50 10.5
2 10.65 23.50 13.00 24.5
3 39.50 39.35 18.75 38.0
4 72.75 2.70 25.00 34.0
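To see why this works, inspect the intermediate grouping: split() buckets the column names by their year-stripped stem, so each list element holds exactly the columns to average:
str(mean.names)
# List of 4
#  $ ABC.M1: chr [1:2] "ABC.M1Y2001" "ABC.M1Y2002"
#  $ ABC.M2: chr [1:2] "ABC.M2Y2001" "ABC.M2Y2002"
#  $ DEF.M1: chr [1:2] "DEF.M1Y2001" "DEF.M1Y2002"
#  $ DEF.M2: chr [1:2] "DEF.M2Y2001" "DEF.M2Y2002"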
You can use Reshape from package splitstackshape and then use plyr, data.table, or base R to compute the means.
library(splitstackshape) # Reshape
library(plyr)            # ddply
kk <- Reshape(df, id.vars = "ID", var.stubs = c("ABC.M1", "ABC.M2", "DEF.M1", "DEF.M2"), sep = "")
> kk
ID AE DB time ABC.M1 ABC.M2 DEF.M1 DEF.M2
1 1 NA NA 1 10.0 11.1 14.0 15
2 2 NA NA 1 12.3 34.0 14.0 15
3 3 NA NA 1 45.0 67.7 14.0 15
4 4 NA NA 1 89.0 -15.6 16.0 12
5 1 NA NA 2 -11.1 12.0 5.0 6
6 2 NA NA 2 9.0 13.0 12.0 34
7 3 NA NA 2 34.0 11.0 23.5 61
8 4 NA NA 2 56.5 21.0 34.0 56
ddply(kk[,c(1,5:8)],.(ID),colwise(mean))
ID ABC.M1 ABC.M2 DEF.M1 DEF.M2
1 1 -0.55 11.55 9.50 10.5
2 2 10.65 23.50 13.00 24.5
3 3 39.50 39.35 18.75 38.0
4 4 72.75 2.70 25.00 34.0
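The data.table route mentioned above would compute the same column-wise means over kk like this (a sketch):
library(data.table)
setDT(kk)[, lapply(.SD, mean), by = ID,
          .SDcols = c("ABC.M1", "ABC.M2", "DEF.M1", "DEF.M2")]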