Following is the dput() output of DT. I would like to sum the values every 3 days, starting from the MIN of DATE, grouped by ID.
structure(list(ID = c("pqr", "abc", "ort", "kkg", "ssc", "ccv",
"xyz", "xyz", "xyz"), DATE = c("2022-06-07", "2022-06-24", "2022-06-02",
"2022-06-01", "2022-06-16", "2022-06-07", "2022-06-11", "2022-06-13", "2022-06-27"
), READING_IN = c(150, 2800, 600, 500, 1395.94, 500, 800, 179, 200
), READING_OUT = c(150, 2800, 600, 500, 1400, 501.4, 371.34,
556.47, 462.75)), class = "data.frame", row.names = c(NA, -9L))
Following is an unsuccessful attempt.
DT$DATE = as.Date(DT$DATE, format = "%Y-%m-%d")
DT1 = DT %>%
group_by(ID, group = cut(as.Date(DT$DATE, format = "%Y-%m-%d"), '3 days')) %>%
summarise(date_range = paste(min(DATE), min(DATE) + 2, sep = ' to '),
sum_in = sum(READING_IN),
sum_out = sum(READING_OUT), .groups = 'drop') %>%
select(-group)
Result:
structure(list(ID = c("abc", "ccv", "kkg", "ort", "pqr", "ssc",
"xyz", "xyz", "xyz"), date_range = c("2022-06-24-2022-06-26",
"2022-06-07-2022-06-09", "2022-06-01-2022-06-03", "2022-06-02-2022-06-04",
"2022-06-07-2022-06-09", "2022-06-16-2022-06-18", "2022-06-11-2022-06-13",
"2022-06-13-2022-06-15", "2022-06-27-2022-06-29"), sum_in = c(2800,
500, 500, 600, 150, 1395.94, 800, 179, 200), sum_out = c(2800,
501.4, 500, 600, 150, 1400, 371.34, 556.47, 462.75)), row.names = c(NA,
-9L), class = c("tbl_df", "tbl", "data.frame"))
Desired Output for ID = xyz:
ID    DATE                       READING_IN  READING_OUT
xyz   2022-06-11 to 2022-06-13   979         927.81
xyz   2022-06-27 to 2022-06-29   200         462.75
I understand the issue here is the entry on 2022-06-13, which should be aggregated into the 2022-06-11 + 2 window. Is there any way to sum the values every 3 days so that the result matches the desired output format?
I believe you were tricked by some group_by() details:
Computations are always done on the ungrouped data frame. To perform computations on the grouped data, you need to use a separate mutate() step before the group_by().
With an extra mutate() step before the group_by(), it behaves as described:
library(tibble)
library(dplyr)
DT %>%
mutate(DATE = as.Date(DATE, format = "%Y-%m-%d")) %>%
group_by(ID) %>%
mutate(date_group = cut(DATE, '3 days')) %>%
group_by(ID, date_group) %>%
summarise(date_range = paste(min(DATE), min(DATE) + 2, sep = ' to '),
sum_in = num(sum(READING_IN), digits = 2),
sum_out = num(sum(READING_OUT), digits = 2), .groups = 'drop') %>%
select(-date_group)
#> # A tibble: 8 × 4
#> ID date_range sum_in sum_out
#> <chr> <chr> <num:.2!> <num:.2!>
#> 1 abc 2022-06-24 to 2022-06-26 2800.00 2800.00
#> 2 ccv 2022-06-07 to 2022-06-09 500.00 501.40
#> 3 kkg 2022-06-01 to 2022-06-03 500.00 500.00
#> 4 ort 2022-06-02 to 2022-06-04 600.00 600.00
#> 5 pqr 2022-06-07 to 2022-06-09 150.00 150.00
#> 6 ssc 2022-06-16 to 2022-06-18 1395.94 1400.00
#> 7 xyz 2022-06-11 to 2022-06-13 979.00 927.81
#> 8 xyz 2022-06-27 to 2022-06-29 200.00 462.75
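A quick base R sketch of why this works: within each ID group, cut() anchors its 3-day bins at that group's earliest date, so for xyz the 2022-06-13 reading lands in the bin starting 2022-06-11. (Note that cut() labels a bin by its left endpoint, e.g. 2022-06-26 for the last bin, while summarise() above reports min(DATE) within the bin, hence 2022-06-27 in the output.)
xyz_dates <- as.Date(c("2022-06-11", "2022-06-13", "2022-06-27"))
cut(xyz_dates, "3 days")
#> [1] 2022-06-11 2022-06-11 2022-06-26
#> Levels: 2022-06-11 2022-06-14 2022-06-17 2022-06-20 2022-06-23 2022-06-26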
Input:
DT <- structure(list(
ID = c(
"pqr", "abc", "ort", "kkg", "ssc", "ccv", "xyz", "xyz", "xyz"
),
DATE = c(
"2022-06-07", "2022-06-24", "2022-06-02", "2022-06-01", "2022-06-16",
"2022-06-07", "2022-06-11", "2022-06-13", "2022-06-27"
),
READING_IN = c(150, 2800, 600, 500, 1395.94, 500, 800, 179, 200),
READING_OUT = c(150, 2800, 600, 500, 1400, 501.4, 371.34, 556.47, 462.75)
), class = "data.frame", row.names = c(NA, -9L))
Created on 2023-01-18 with reprex v2.0.2
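For larger data, here is a data.table sketch of the same logic (my translation, not tested beyond this example; the as.character() around cut() sidesteps grouped factor assignment):
library(data.table)
dt <- as.data.table(DT)
dt[, DATE := as.Date(DATE)]
dt[, date_group := as.character(cut(DATE, "3 days")), by = ID]
dt[, .(date_range = paste(min(DATE), min(DATE) + 2, sep = " to "),
       sum_in = sum(READING_IN),
       sum_out = sum(READING_OUT)),
   by = .(ID, date_group)][, date_group := NULL][]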
Using the data provided below, I would like to group my data table by Date and, for the columns referenced in colstoCut, apply the cut() function in my code. I can achieve this using dplyr, but my actual data is quite large and it's too slow. I've made several attempts based on approaches I've found on SO and elsewhere, but nothing I've tried so far works.
library(tidyverse)
library(data.table)
cutme <- structure(list(Date = structure(c(18993, 18993, 18993, 18993,
18993, 18994, 18994, 18994, 18994, 18994, 18995, 18995, 18995,
18995, 18995, 18996, 18996, 18996, 18996, 18996, 18997, 18997,
18997, 18997, 18997), class = "Date"), val1 = c(2, 1, 1, 1, 2,
0, 0, 1, 0, 0, 0, 1, 1, 2, 1, 1, 2, 1, 0, 0, 1, 1, 0, 0, 1),
val2 = c(306, 291, 306, 300, 306, 295, 299, 291, 302, 298,
301, 300, 291, 301, 297, 290, 294, 298, 293, 294, 310, 305,
293, 322, 299), val3 = c(278.115915402059, 275.206632766366,
277.843871977486, 274.375934310537, 271.976342200702, 314.694861131995,
322.55015422103, 312.56565930567, 321.31779178896, 310.742656596237,
294.839125866978, 305.946938215211, 317.090018318496, 319.386088532157,
312.323793703966, 309.29514039576, 313.96520162878, 317.360306029457,
310.212544203034, 320.263145398593, 310.432980834677, 296.638028917156,
294.622602772748, 305.922855022984, 308.30568677617)), row.names = c(NA,
-25L), groups = structure(list(`Date,` = structure(c(18993, 18994,
18995, 18996, 18997), class = "Date"), .rows = structure(list(
1:5, 6:10, 11:15, 16:20, 21:25), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("data.table",
"data.frame"))
# cut these columns [transforms them to bins (factors)]
colstoCut <- colnames(cutme)[-1]
# approach using dplyr (works but too slow on real data)
cutme <- cutme %>%
dplyr::group_by(Date) %>%
dplyr::mutate_at(all_of(colstoCut), ~cut(x = ., breaks = unique(quantile(x = ., probs = seq(0,1,0.025))), include.lowest = TRUE)) %>%
dplyr::ungroup(.)
## several attempts using data.table ##
# no error thrown but columns are not actually cut
cutme[, (colstoCut) := Map(function(x) cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date]
# Error in cut.default(x = ., breaks = seq(0, 1, 0.0025), include.lowest = TRUE) : 'x' must be numeric
cutme[, (colstoCut) := lapply(.SD, cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date, .SDcols = colstoCut]
# Error in cut.default(x = ., breaks = seq(0, 1, 0.0025), include.lowest = TRUE) : 'x' must be numeric
in_cols <- colstoCut
out_cols <- paste0(in_cols,"fact")
cutme[, (out_cols) := lapply(.SD, cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date, .SDcols = in_cols]
# Error in cut.default(x = ., breaks = seq(0, 1, 0.0025), include.lowest = TRUE) : 'x' must be numeric
cutme[, (colstoCut) := lapply(colstoCut, function(x) cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date]
# Error in .subset2(x, i, exact = exact) : recursive indexing failed at level 2
cutme[, (colstoCut) := lapply(.SD, cut(x = cutme[[colstoCut]], breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date]
For the given sample dataset with 1 grouping column and 3 value columns to be transformed, the data.table equivalent of OP's dplyr code simply is
library(data.table)
mycut <- \(x) cut(x, unique(quantile(x, probs = seq(0, 1, 0.025))), include.lowest = TRUE)
cutme <- setDT(cutme)[, lapply(.SD, mycut), .SDcols = colstoCut, by = Date]
cutme
Date val1 val2 val3
<Date> <fctr> <fctr> <fctr>
1: 2022-01-01 (1.9,2] (305.4,306] (278.09,278.12]
2: 2022-01-01 [1,1.1] [291,291.9] (275.12,275.21]
3: 2022-01-01 [1,1.1] (305.4,306] (277.58,277.84]
4: 2022-01-01 [1,1.1] (299.1,300] (274.14,274.38]
5: 2022-01-01 (1.9,2] (305.4,306] [271.98,272.22]
6: 2022-01-02 [0,0.1] (294.6,295] (314.5,314.7]
7: 2022-01-02 [0,0.1] (298.9,299] (322.4,322.6]
8: 2022-01-02 (0.9,1] [291,291.4] (312.4,312.6]
9: 2022-01-02 [0,0.1] (301.7,302] (320.7,321.3]
10: 2022-01-02 [0,0.1] (297.7,298] [310.7,310.9]
11: 2022-01-03 [0,0.1] (300.9,301] [294.8,295.9]
12: 2022-01-03 (0.9,1] (299.7,300] (304.8,305.9]
13: 2022-01-03 (0.9,1] [291,291.6] (316.6,317.1]
14: 2022-01-03 (1.9,2] (300.9,301] (319.2,319.4]
15: 2022-01-03 (0.9,1] (296.4,297] (311.7,312.3]
16: 2022-01-04 (0.9,1] [290,290.3] [309.3,309.39]
17: 2022-01-04 (1.9,2] (293.9,294] (313.59,313.97]
18: 2022-01-04 (0.9,1] (297.6,298] (317.02,317.36]
19: 2022-01-04 [0,0.1] (292.7,293] (310.12,310.21]
20: 2022-01-04 [0,0.1] (293.9,294] (319.97,320.26]
21: 2022-01-05 (0.9,1] (309.5,310] (310.2,310.4]
22: 2022-01-05 (0.9,1] (304.4,305] (296.4,296.6]
23: 2022-01-05 [0,0.1] [293,293.6] [294.6,294.8]
24: 2022-01-05 [0,0.1] (320.8,322] (305,305.9]
25: 2022-01-05 (0.9,1] (298.4,299] (308.1,308.3]
Date val1 val2 val3
Based on the comment from akrun, the following worked for me:
in_cols <- colstoCut
out_cols <- paste0(in_cols,"_fact")
cutme[, (out_cols) := lapply(.SD, function(.) cut(x = ., breaks = unique(quantile(x = ., probs = seq(0,1,0.025))), include.lowest = TRUE)), by = Date, .SDcols = (in_cols) ]
cutme <- cutme %>%
dplyr::select(Date, all_of(out_cols))
colnames(cutme) <- gsub("_fact","",colnames(cutme))
Because I wasn't able to directly transform the numeric columns into factor columns like I can with dplyr::mutate_at(vars(colstoCut), ~cut(...)), I removed the original (uncut) columns from the data table, selected the new cut columns, and renamed them with gsub.
I'm sure there's a cleaner way to do this without resorting to selecting / renaming columns, but it works for me. I'm happy to accept a better answer if and when one is posted.
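One data.table-native sketch of that cleanup (replacing the select()/gsub() step right after the := assignment above): drop the original numeric columns by reference and rename the cut ones back.
cutme[, (in_cols) := NULL]
setnames(cutme, out_cols, in_cols)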
I know this question has been asked before on this forum, but my data set is very large and I could not make any of the existing solutions work.
Here's a sample dataset.
list(structure(list(id = c("id1", "id2", "id3"), value = c(2,
0, 2), value_2 = c(0, 1, 2)), class = "data.frame", row.names = c(NA,
-3L)), structure(list(id = c("id1", "id2", "id3"), value = c(-1,
0, 0), value_2 = c(1, 0, -3)), class = "data.frame", row.names = c(NA,
-3L)), structure(list(id = c("id1", "id2", "id3"), value = c(-2,
1, 0), value_2 = c(-2, 0, 1)), class = "data.frame", row.names = c(NA,
-3L)), structure(list(id = c("id1", "id2", "id3"), value = c(2,
0, 0), value_2 = c(-2, 0, -1)), class = "data.frame", row.names = c(NA,
-3L)))
I want to calculate the mean of the column 'value' for each 'id' across the list. The result should look like this, where 'value_mean' should be the average of the column 'value' of each id in lists 1, 2, 3 and 4.
structure(list(id = c("id1", "id2", "id3"), value_mean = c(NA,
NA, NA)), class = "data.frame", row.names = c(NA, -3L))
Please note that my real list has 5000 data frames where each data frame has 100,000 rows. I have tried using "bind_rows" and similar functions to convert the list to a data frame first, but the data frame becomes too large and R runs out of memory.
Any help would be much appreciated! Thanks!
We may bind the list elements into a single dataset and then use a group-by mean operation:
library(dplyr)
bind_rows(lst1) %>%
group_by(id) %>%
summarise(value_mean = mean(value, na.rm = TRUE), .groups = 'drop')
Output:
# A tibble: 3 x 2
id value_mean
<chr> <dbl>
1 id1 0.25
2 id2 0.25
3 id3 0.5
If the datasets have the same dimensions and the 'id' values are in the same order, extract the 'value' column, use Reduce to do elementwise + and divide by the length of the list:
Reduce(`+`, lapply(lst1, `[[`, "value"))/length(lst1)
[1] 0.25 0.25 0.50
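To get the posted output format, a small sketch wrapping the same result into a data frame (relying on the same assumption that every element carries the ids in the same order):
data.frame(id = lst1[[1]]$id,
           value_mean = Reduce(`+`, lapply(lst1, `[[`, "value")) / length(lst1))
#    id value_mean
# 1 id1       0.25
# 2 id2       0.25
# 3 id3       0.50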
Or a more efficient approach is with dapply/t_list from collapse
library(collapse)
dapply(t_list(dapply(lst1, `[[`, "value")), fmean)
V1 V2 V3
0.25 0.25 0.50
You could calculate the mean for each data.frame in your list separately, and then combine the per-frame means into an overall mean, weighting each one by the number of rows it is based on:
library(dplyr)
library(purrr)
my_list %>%
map_df(~ .x %>%
group_by(id) %>%
summarise(n = n(),
mean = mean(value, na.rm = TRUE))) %>%
group_by(id) %>%
summarize(mean_value = sum(n * mean)/ sum(n))
This returns
# A tibble: 3 x 2
id mean_value
<chr> <dbl>
1 id1 0.25
2 id2 0.25
3 id3 0.5
Disclaimer: I'm tired right now, don't know if this makes any sense.
I have this data in my Excel files, and there is too much of it to count by hand in Excel. I want to count how many days in each month have a value of more than 50, and turn the result into a monthly summary. Could someone help me solve this?
Another option uses as.yearmon from zoo - filter the rows where 'Value' is greater than 50, then count() by month after converting 'Date' to yearmon class:
library(dplyr)
library(zoo)
df %>%
filter(Value > 50) %>%
count(month_year = as.yearmon(Date))
Output:
month_year n
1 Jan 2010 3
2 Feb 2010 1
data
df <- structure(list(Date = structure(c(14610, 14611, 14612, 14618,
14618, 14624, 14641), class = "Date"), Value = c(27, 35, 78,
88, 57, 48, 99)), class = "data.frame", row.names = c(NA, -7L
))
Suppose your data is given by
df <- data.frame(Date = as.Date(c("1/1/2010", "1/2/2010", "1/3/2010", "1/9/2010", "1/9/2010", "1/15/2010", "2/1/2010"), "%m/%d/%Y"),
Value = c(27, 35, 78, 88, 57, 48, 99))
To count your specific values you could use
library(dplyr)
df %>%
group_by(month_year = format(Date, "%m-%y")) %>%
summarise(count = sum(Value > 50))
which returns
# A tibble: 2 x 2
month_year count
<chr> <int>
1 01-10 3
2 02-10 1
Note: your Date column has to contain actual Date values (as created by as.Date).
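For reference, a base R sketch of the same count with no extra packages, using the df defined above:
tapply(df$Value > 50, format(df$Date, "%Y-%m"), sum)
# 2010-01 2010-02
#       3       1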
I'm having trouble joining two dfs, and I believe it occurs because they are two different kinds of objects. Here is my first df:
head(df1)
ativo
dia BBAS3.SA ITSA4.SA PETR4.SA
2000-03-31 -0.16925030 0.04819535 0.02141427
2000-04-28 -0.04720254 -0.09236691 -0.09300770
2000-05-31 -0.06899136 -0.03948513 -0.02600493
class(df1)
#[1] "table"
This is my second df:
head(df2)
SELIC
mar 2000 18.85
abr 2000 18.62
mai 2000 18.51
class(df2)
#[1] "xts" "zoo"
I tried to merge it, but I got this:
df3 <- merge(df1, df2)
head(df3)
# dia ativo Freq SELIC
#1 2000-03-31 BBAS3.SA -0.16925030 18.85
#2 2000-04-28 BBAS3.SA -0.04720254 18.85
#3 2000-05-31 BBAS3.SA -0.06899136 18.85
And I need to have this:
# ativo
#dia BBAS3.SA ITSA4.SA PETR4.SA SELIC
# 2000-03-31 -0.16925030 0.04819535 0.02141427 18.85
# 2000-04-28 -0.04720254 -0.09236691 -0.09300770 18.62
# 2000-05-31 -0.06899136 -0.03948513 -0.02600493 18.51
Or this:
# ativo
#dia BBAS3.SA ITSA4.SA PETR4.SA SELIC
# mar 2000 -0.16925030 0.04819535 0.02141427 18.85
# abr 2000 -0.04720254 -0.09236691 -0.09300770 18.62
# mai 2000 -0.06899136 -0.03948513 -0.02600493 18.51
Thank you
Data in dput format.
dput(head(df1, 20))
structure(c(-0.16925030161688, -0.0472025393643018, -0.0689913598242851,
0.141311262642138, -0.0315001304458658, -0.0145050751136923,
0.0145050751136922, -0.0678475025589349, -0.0260196172650772,
0, 0.0480340922090001, 0.282644671185294, -0.091151845340713,
0.00551488285057102, 0.173804037328355, -0.097043771073958, 0.201248689088215,
-0.0836092984613955, 0.0146170962005332, -0.193997015642139,
0.0481953506988029, -0.0923669055769145, -0.0394851266704161,
0.149348020700408, 0.0340890968099324, 0.0753055790307224, -0.0315784758872336,
-0.149705665792558, -0.00622308912309538, 0, 0.285026811426058,
-0.037199331538396, -0.108074213231234, -0.0558730868639311,
0.0826859194325133, 0.0812613806626518, 0, 0.00487103707074807,
-0.0600197798057272, -0.0804770853805181, 0.0214142733017596,
-0.0930076959239043, -0.0260049316189392, 0.262819819149602,
-0.151890530559808, 0.162839251757119, -0.0502434129983582, -0.0341645850441082,
-0.066119445873884, -0.0347751105378222, 0, 0.174537138064208,
0.00731111520881221, -0.156186459447952, 0.128301660685536, 0.113194988393825,
-0.102021552584886, 0.00369697765156146, 0.014652098503586, -0.0696419860052779,
-0.019803717564094, 0, 0.0444961639551652, 0.0730668010171692,
-0.0109530737239368, 0.0374915960907066, -0.0941194227900671,
0.0453306426927274, -0.173274373945029, 0.228535671136248, 0,
0.0923733553009261, -0.0400062320449435, 0.0101532578824621,
-0.0204079876556867, 0.0648597665063123, 0.0238683058395199,
-0.00378154015037378, -0.0288204487996149, -0.0157179109799149
), .Dim = c(20L, 4L), .Dimnames = list(dia = c("2000-03-31",
"2000-04-28", "2000-05-31", "2000-06-30", "2000-07-31", "2000-08-31",
"2000-09-29", "2000-10-31", "2000-11-30", "2000-12-28", "2000-12-29",
"2001-01-31", "2001-02-28", "2001-03-30", "2001-04-30", "2001-05-31",
"2001-06-29", "2001-07-31", "2001-08-31", "2001-09-28"), ativo = c("BBAS3.SA",
"ITSA4.SA", "PETR4.SA", "VALE3.SA")), class = "table")
dput(head(df2, 20))
structure(c(18.85, 18.62, 18.51, 18.04, 16.85, 16.52, 16.56,
16.6, 16.51, 16.19, 15.49, 15.2, 15.39, 16.02, 16.43, 17.28,
18.57, 19, 19.06, 19.06), class = c("xts", "zoo"), .indexCLASS = "yearmon", tclass = "yearmon", .indexTZ = "UTC", tzone = "UTC", index = structure(c(951868800,
954547200, 957139200, 959817600, 962409600, 965088000, 967766400,
970358400, 973036800, 975628800, 978307200, 980985600, 983404800,
986083200, 988675200, 991353600, 993945600, 996624000, 999302400,
1001894400), tzone = "UTC", tclass = "yearmon"), .Dim = c(20L,
1L), .Dimnames = list(NULL, "retorno"))
The following code does not produce an object of class "table" but it otherwise has the format you want.
library(xts)
library(tidyverse)
df1 %>%
as.data.frame() %>%
mutate(dia = as.Date(dia),
dia = as.yearmon(dia)) %>%
group_by(dia, ativo) %>%
summarise(Freq = sum(Freq)) %>%
ungroup() %>%
pivot_wider(
id_cols = dia,
names_from = ativo,
values_from = Freq
) %>%
left_join(df2 %>% cbind.data.frame(dia = index(.), .), by = "dia")
Another option, this time producing a "table" is
df1 %>%
as.data.frame() %>%
mutate(dia = as.Date(dia),
dia = as.yearmon(dia)) %>%
group_by(dia, ativo) %>%
summarise(Freq = sum(Freq)) %>%
ungroup() %>%
pivot_wider(
id_cols = dia,
names_from = ativo,
values_from = Freq
) %>%
left_join(df2 %>% cbind.data.frame(dia = index(.), .), by = "dia") %>%
pivot_longer(
cols = -dia,
names_to = "ativo",
values_to = "value"
) %>%
xtabs(value ~ dia + ativo, data = .)
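An alternative sketch that stays in zoo, merging on a yearmon index (the duplicate Dec 2000 rows in df1 are summed, as in the pipelines above; "SELIC" is the column name from the question):
library(zoo)
z1 <- zoo(unclass(df1), as.yearmon(as.Date(rownames(df1))))  # table -> zoo matrix
z1 <- aggregate(z1, index(z1), sum)                          # collapse duplicate months
merge(z1, SELIC = as.zoo(df2)[, 1])                          # join on the shared yearmon index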
I have a dataset with three variables (DateTime, Transmitter, and timediff). The timediff column is the time difference between subsequent detections of a transmitter. I want to know how many times the time differences followed a specific pattern. Here is a sample of my data.
> dput(Example)
structure(list(DateTime = structure(c(1501117802, 1501117805,
1501117853, 1501117857, 1501117913, 1501117917, 1501186253, 1501186254,
1501186363, 1501186365, 1501186541, 1501186542, 1501186550, 1501186590,
1501186591, 1501186644, 1501186646, 1501186737, 1501186739, 1501187151
), class = c("POSIXct", "POSIXt"), tzone = "GMT"), Transmitter = c(30767L,
30767L, 30767L, 30767L, 30767L, 30767L, 30767L, 30767L, 30767L,
30767L, 30767L, 30767L, 30767L, 30767L, 30767L, 30767L, 30767L,
30767L, 30767L, 30767L), timediff = c(44, 3, 48, 4, 56, 4, 50,
1, 42, 2, 56, 1, 8, 40, 1, 53, 2, 37, 2, 42)), row.names = c(NA,
20L), class = "data.frame")
So looking at the time difference column, I want to know how many times there is a single timediff < 8seconds, how many times there are two subsequent timediff < 8 seconds, how many times there are three subsequent timediff < 8 seconds, and so on.
Example: In the given dataset, a single timediff <8 seconds happens 7 times while two subsequent timediffs < 8 seconds happens twice.
A "single timediff" = 44, 3 , 48
A "double timediff" = 56, 1, 8, 40
In terms of an output, I'd be looking for something like this...
> dput(output)
structure(list(ID = 30767, Single = 7, Double = 2), class = "data.frame", row.names = c(NA,
-1L))
Thanks for the help!
One dplyr possibility could be:
df %>%
mutate(cond = timediff <= 8) %>%
group_by(rleid = with(rle(cond), rep(seq_along(lengths), lengths))) %>%
add_count(rleid, name = "n_timediff") %>%
filter(cond & row_number() == 1) %>%
ungroup() %>%
count(n_timediff)
n_timediff n
<int> <int>
1 1 8
2 2 1
Considering there could be more values in "Transmitter", you can do (this also requires tidyr):
df %>%
mutate(cond = timediff <= 8) %>%
group_by(Transmitter, rleid = with(rle(cond), rep(seq_along(lengths), lengths))) %>%
add_count(rleid, name = "n_timediff") %>%
filter(cond & row_number() == 1) %>%
ungroup() %>%
group_by(Transmitter) %>%
count(n_timediff) %>%
mutate(n_timediff = paste("timediff", n_timediff, sep = "_")) %>%
spread(n_timediff, n)
Transmitter timediff_1 timediff_2
<int> <int> <int>
1 30767 8 1
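For larger data, a data.table sketch of the same run counting, where rleid() replaces the base-rle bookkeeping (my translation, using the same <= 8 condition as above):
library(data.table)
setDT(df)[, run := rleid(timediff <= 8)]
runs <- df[timediff <= 8, .(len = .N), by = .(Transmitter, run)]  # length of each short-gap run
runs[, .(n = .N), by = .(Transmitter, len)]                       # how many runs of each length
#    Transmitter len n
# 1:       30767   1 8
# 2:       30767   2 1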