Finding mean of variable across each month/year - r

I have a dataset that looks similar to this:
> dput(df)
structure(list(Date = c("3/23/21", "4/11/22", "6/30/22"), Banana_wasted = c(4L,
2L, 5L), Apple_wasted = c(6L, 0L, 3L), Orange_wasted = c(1L,
4L, 1L), Banana_ordered = c(5L, 7L, 7L), Apple_Ordered = c(9L,
8L, 9L), Orange_ordered = c(5L, 6L, 6L), Banana_eaten = c(5L,
5L, 6L), Apple_eaten = c(7L, 7L, 4L), Orange_eaten = c(8L, 8L,
8L)), class = "data.frame", row.names = c(NA, -3L))
I want to find the % of fruit wasted per month/year (in relation to how many fruits were ordered).
it should be:
(Banana_wasted+Apple_wasted+Orange_wasted) / (Banana_ordered + Apple_ordered+ Orange_ordered)
So, for 3/21, it should be:
(4+6+1/5+9+5)*100 = 57.9%
I would like to do this for every month of the year.

library(tidyverse)
df %>%
group_by(Date = floor_date(mdy(Date), "month")) %>%
summarise(
wasted = sum(across(contains("wasted"))) / sum(across(contains("ordered"))),
wasted_eaten = sum(across(contains("wasted"))) / sum(across(contains("eaten")))
)
# A tibble: 3 x 3
Date wasted wasted_eaten
<date> <dbl> <dbl>
1 2021-03-01 0.579 0.579
2 2022-04-01 0.286 0.314
3 2022-06-01 0.409 0.523

library(dplyr)
library(lubridate)
df %>%
mutate(Date = as.Date(Date, format = "%m/%d/%y"),
pct_wasted = (Banana_wasted + Apple_wasted + Orange_wasted) / (Banana_ordered + Apple_Ordered + Orange_ordered) * 100) %>%
group_by(year = year(Date), month = month(Date)) %>%
summarize(avg_pct_wasted = mean(pct_wasted))
#> # A tibble: 3 × 3
#> # Groups: year [2]
#> year month avg_pct_wasted
#> <dbl> <dbl> <dbl>
#> 1 2021 3 57.9
#> 2 2022 4 28.6
#> 3 2022 6 40.9
Created on 2023-02-06 with reprex v2.0.2

Pivot longer to get single wasted and ordered columns across all fruits; use lubridate::floor_date() and mdy() to get months from Date; group by month; then sum and divide to get your percentages:
library(dplyr)
library(tidyr)
library(lubridate)
dat %>%
rename(Apple_ordered = Apple_Ordered) %>% # for consistent capitalization
pivot_longer(
Banana_wasted:Orange_eaten,
names_to = c("Fruit", ".value"),
names_sep = "_"
) %>%
group_by(month = floor_date(mdy(Date), "month")) %>%
summarize(pct_wasted = sum(wasted) / sum(ordered)) %>%
ungroup()
# # A tibble: 3 × 2
# month pct_wasted
# <date> <dbl>
# 1 2021-03-01 0.579
# 2 2022-04-01 0.286
# 3 2022-06-01 0.409
If you prefer character labels, use strftime() instead of floor_date(), and scales::percent() for the percentages:
library(scales)
dat %>%
rename(Apple_ordered = Apple_Ordered) %>%
pivot_longer(
Banana_wasted:Orange_eaten,
names_to = c("Fruit", ".value"),
names_sep = "_"
) %>%
group_by(month = strftime(mdy(Date), "%B %Y")) %>%
summarize(pct_wasted = percent(sum(wasted) / sum(ordered), accuracy = 0.1)) %>%
ungroup()
# # A tibble: 3 × 2
# month pct_wasted
# <chr> <chr>
# 1 April 2022 28.6%
# 2 June 2022 40.9%
# 3 March 2021 57.9%

Related

How can we check if any 2 intervals of a unique ID overlaps?

I have data of patient prescription of oral DM drugs, i.e. DPP4 and SU, and would like to find out if patients had taken the drugs concurrently (i.e. whether there are overlapping intervals for DPP4 and SU within the same patient ID).
Sample data:
ID DRUG START END
1 1 DPP4 2020-01-01 2020-01-20
2 1 DPP4 2020-03-01 2020-04-01
3 1 SU 2020-03-15 2020-04-30
4 2 SU 2020-10-01 2020-10-31
5 2 DPP4 2020-12-01 2020-12-31
In the sample data above,
ID == 1, patient had DPP4 and SU concurrently from 2020-03-15 to 2020-04-01.
ID == 2, patient had consumed both medications at separate intervals.
I thought of splitting the data into 2, one for DPP4 and another for SU. Then, do a full join, and compare each DPP4 interval with each SU interval. This may be okay for small data, but if a patient has like 5 rows for DPP4 and another 5 for SU, we will have 25 comparisons, which may not be efficient. Add that with 10000+ patients.
I am not sure how to do it.
New data:
Hope to have a new df that looks like this. Or anything that is tidy.
ID DRUG START END
1 1 DPP4-SU 2020-03-15 2020-04-01
2 2 <NA> <NA> <NA>
Data Code:
df <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L), DRUG = c("DPP4", "DPP4",
"SU", "SU", "DPP4"), START = structure(c(18262, 18322, 18336,
18536, 18597), class = "Date"), END = structure(c(18281, 18353,
18382, 18566, 18627), class = "Date")), class = "data.frame", row.names = c(NA,
-5L))
df_new <- structure(list(ID = 1:2, DRUG = c("DPP4-SU", NA), START = structure(c(18336,
NA), class = "Date"), END = structure(c(18353, NA), class = "Date")), class = "data.frame", row.names = c(NA,
-2L))
Edit:
I think from the sample data I gave, it may seem that there can only be 1 intersecting interval. But there may be more. So, I think this would be better data to illustrate.
structure(list(ID = c(3, 3, 3, 3, 3, 3, 3), DRUG = c("DPP4",
"DPP4", "SU", "SU", "DPP4", "DPP4", "DPP4"), START = structure(c(17004,
17383, 17383, 17418, 17437, 17649, 17676), class = c("IDate",
"Date")), END = structure(c(17039, 17405, 17405, 17521, 17625,
17669, 17711), class = c("IDate", "Date")), duration = c(35L,
22L, 22L, 103L, 188L, 20L, 35L), INDEX = c(1L, 0L, 0L, 0L, 0L,
0L, 0L)), row.names = c(NA, -7L), class = c("tbl_df", "tbl",
"data.frame"))
It's way more complicated than dear #AnoushiravanR's but as an alternative you could try
library(dplyr)
library(tidyr)
library(lubridate)
df %>%
full_join(x = ., y = ., by = "ID") %>%
# filter(DRUG.x != DRUG.y | START.x != START.y | END.x != END.y) %>%
filter(DRUG.x != DRUG.y) %>%
group_by(ID, intersection = intersect(interval(START.x, END.x), interval(START.y, END.y))) %>%
drop_na(intersection) %>%
filter(START.x == first(START.x)) %>%
summarise(DRUG = paste(DRUG.x, DRUG.y, sep = "-"),
START = as_date(int_start(intersection)),
END = as_date(int_end(intersection)),
.groups = "drop") %>%
select(-intersection)
returning
# A tibble: 1 x 4
ID DRUG START END
<int> <chr> <date> <date>
1 1 DPP4-SU 2020-03-15 2020-04-01
Edit: Changed the filter condition. The former one was flawed.
Updated Solution
I have made considerable modifications based on the newly provided data set. This time I first created interval for each START and END pair and extract the intersecting period between them. As dear Martin nicely made use of them we could use lubridate::int_start and lubridate::int_end to extract the START and END date of each interval:
library(dplyr)
library(lubridate)
library(purrr)
library(tidyr)
df %>%
group_by(ID) %>%
arrange(START, END) %>%
mutate(int = interval(START, END),
is_over = c(NA, map2(int[-n()], int[-1],
~ intersect(.x, .y)))) %>%
unnest(cols = c(is_over)) %>%
select(-int) %>%
filter(!is.na(is_over) | !is.na(lead(is_over))) %>%
select(!c(START, END)) %>%
mutate(grp = cumsum(is.na(is_over))) %>%
group_by(grp) %>%
summarise(ID = first(ID),
DRUG = paste0(DRUG, collapse = "-"),
is_over = na.omit(is_over)) %>%
mutate(START = int_start(is_over),
END = int_end(is_over)) %>%
select(!is_over)
# A tibble: 1 x 5
grp ID DRUG START END
<int> <int> <chr> <dttm> <dttm>
1 1 1 DPP4-SU 2020-03-15 00:00:00 2020-04-01 00:00:00
Second data set:
# A tibble: 2 x 5
grp ID DRUG START END
<int> <dbl> <chr> <dttm> <dttm>
1 1 3 DPP4-SU 2017-08-05 00:00:00 2017-08-27 00:00:00
2 2 3 SU-DPP4 2017-09-28 00:00:00 2017-12-21 00:00:00
Update
As per updated df
df <- structure(list(ID = c(3, 3, 3, 3, 3, 3, 3), DRUG = c(
"DPP4",
"DPP4", "SU", "SU", "DPP4", "DPP4", "DPP4"
), START = structure(c(
17004,
17383, 17383, 17418, 17437, 17649, 17676
), class = c(
"IDate",
"Date"
)), END = structure(c(
17039, 17405, 17405, 17521, 17625,
17669, 17711
), class = c("IDate", "Date")), duration = c(
35L,
22L, 22L, 103L, 188L, 20L, 35L
), INDEX = c(
1L, 0L, 0L, 0L, 0L,
0L, 0L
)), row.names = c(NA, -7L), class = c(
"tbl_df", "tbl",
"data.frame"
))
we obtain
> dfnew
ID DRUG start end
3.3 3 DPP4-SU 2017-08-05 2017-08-27
3.7 3 SU-DPP4 2017-09-28 2017-12-21
A base R option (not as fancy as the answers by #Anoushiravan R or #Martin Gal)
f <- function(d) {
d <- d[with(d, order(START, END)), ]
idx <- subset(
data.frame(which((u <- with(d, outer(START, END, `<`))) & t(u), arr.ind = TRUE)),
row > col
)
if (nrow(idx) == 0) {
return(data.frame(ID = unique(d$ID), DRUG = NA, start = NA, end = NA))
}
with(
d,
do.call(rbind,
apply(
idx,
1,
FUN = function(v) {
data.frame(
ID = ID[v["row"]],
DRUG = paste0(DRUG[sort(unlist(v))], collapse = "-"),
start = START[v["row"]],
end = END[v["col"]]
)
}
))
)
}
dfnew <- do.call(rbind, Map(f, split(df, ~ID)))
gives
> dfnew
ID DRUG start end
1 1 DPP4-SU 2020-03-15 2020-04-01
2 2 <NA> <NA> <NA>
You may use a slightly different approach from the above answers, but this will give you results in format different than required. Obviously, these can be joined to get expected results. You may try this
df <- structure(list(ID = c(3, 3, 3, 3, 3, 3, 3), DRUG = c("DPP4", "DPP4", "SU", "SU", "DPP4", "DPP4", "DPP4"), START = structure(c(17004, 17383, 17383, 17418, 17437, 17649, 17676), class = c("IDate", "Date")), END = structure(c(17039, 17405, 17405, 17521, 17625, 17669, 17711), class = c("IDate", "Date"))), row.names = c(NA, -7L), class = c("tbl_df", "tbl", "data.frame"))
df
#> # A tibble: 7 x 4
#> ID DRUG START END
#> <dbl> <chr> <date> <date>
#> 1 3 DPP4 2016-07-22 2016-08-26
#> 2 3 DPP4 2017-08-05 2017-08-27
#> 3 3 SU 2017-08-05 2017-08-27
#> 4 3 SU 2017-09-09 2017-12-21
#> 5 3 DPP4 2017-09-28 2018-04-04
#> 6 3 DPP4 2018-04-28 2018-05-18
#> 7 3 DPP4 2018-05-25 2018-06-29
library(tidyverse)
df %>%
mutate(treatment_id = row_number()) %>%
pivot_longer(c(START, END), names_to = 'event', values_to = 'dates') %>%
mutate(event = factor(event, levels = c('END', 'START'), ordered = TRUE)) %>%
group_by(ID) %>%
arrange(dates, event, .by_group = TRUE) %>%
mutate(overlap = cumsum(ifelse(event == 'START', 1, -1))) %>%
filter((overlap > 1 & event == 'START') | (overlap > 0 & event == 'END'))
#> # A tibble: 4 x 6
#> # Groups: ID [1]
#> ID DRUG treatment_id event dates overlap
#> <dbl> <chr> <int> <ord> <date> <dbl>
#> 1 3 SU 3 START 2017-08-05 2
#> 2 3 DPP4 2 END 2017-08-27 1
#> 3 3 DPP4 5 START 2017-09-28 2
#> 4 3 SU 4 END 2017-12-21 1
on originally provided data
# A tibble: 2 x 6
# Groups: ID [1]
ID DRUG treatment_id event dates overlap
<int> <chr> <int> <ord> <date> <dbl>
1 1 SU 3 START 2020-03-15 2
2 1 DPP4 2 END 2020-04-01 1
For transforming/getting results in original shape, you may filter overlapping rows
library(tidyverse)
df_new <- structure(list(ID = c(3, 3, 3, 3, 3, 3, 3), DRUG = c("DPP4", "DPP4", "SU", "SU", "DPP4", "DPP4", "DPP4"), START = structure(c(17004, 17383, 17383, 17418, 17437, 17649, 17676), class = c("IDate", "Date")), END = structure(c(17039, 17405, 17405, 17521, 17625, 17669, 17711), class = c("IDate", "Date"))), row.names = c(NA, -7L), class = c("tbl_df", "tbl", "data.frame"))
df_new %>%
mutate(treatment_id = row_number()) %>%
pivot_longer(c(START, END), names_to = 'event', values_to = 'dates') %>%
mutate(event = factor(event, levels = c('END', 'START'), ordered = TRUE)) %>%
group_by(ID) %>%
arrange(dates, event, .by_group = TRUE) %>%
mutate(overlap = cumsum(ifelse(event == 'START', 1, -1))) %>%
filter((overlap > 1 & event == 'START') | (overlap > 0 & event == 'END')) %>%
left_join(df_new %>% mutate(treatment_id = row_number()), by = c('ID', 'DRUG', 'treatment_id'))
#> # A tibble: 4 x 8
#> # Groups: ID [1]
#> ID DRUG treatment_id event dates overlap START END
#> <dbl> <chr> <int> <ord> <date> <dbl> <date> <date>
#> 1 3 SU 3 START 2017-08-05 2 2017-08-05 2017-08-27
#> 2 3 DPP4 2 END 2017-08-27 1 2017-08-05 2017-08-27
#> 3 3 DPP4 5 START 2017-09-28 2 2017-09-28 2018-04-04
#> 4 3 SU 4 END 2017-12-21 1 2017-09-09 2017-12-21
Created on 2021-08-10 by the reprex package (v2.0.0)

Long to wider format

I have lab records of 30,000 unique ID's. I need to convert my data from long to wider format for each ID and TEST_DATE related to that unique ID.
Example for one ID :
I need to convert this to a wider format like this:
I have a dataset with 30,000 ID's and I need to do this for each ID. The ID with the maximum number of tests will determine our number of columns.
I will appreciate any ideas that you might have to solve this problem! Thank you
Try this:
library(dplyr)
library(tidyr)
#Code
new <- df %>%
group_by(ACCT,TEST_DATE) %>%
summarise(RESULT=round(mean(RESULT,na.rm=T),2)) %>%
ungroup() %>%
mutate(across(-ACCT,~as.character(.))) %>%
pivot_longer(-ACCT) %>%
group_by(ACCT,name) %>%
mutate(name=paste0(name,row_number())) %>%
pivot_wider(names_from = name,values_from=value) %>%
mutate(across(starts_with('RESULT'),~as.numeric(.)))
Output:
# A tibble: 2 x 7
# Groups: ACCT [2]
ACCT TEST_DATE1 RESULT1 TEST_DATE2 RESULT2 TEST_DATE3 RESULT3
<int> <chr> <dbl> <chr> <dbl> <chr> <dbl>
1 37733 9/1/2016 3 10/18/2016 2 11/1/2016 1
2 37734 9/1/2016 5 10/18/2016 4 11/1/2016 3
Some data used:
#Data
df <- structure(list(ACCT = c(37733L, 37733L, 37733L, 37734L, 37734L,
37734L), TEST_DATE = c("9/1/2016", "10/18/2016", "11/1/2016",
"9/1/2016", "10/18/2016", "11/1/2016"), RESULT = c(3L, 2L, 1L,
5L, 4L, 3L)), class = "data.frame", row.names = c(NA, -6L))
Here is a data.table option with dcast that might help (borrow data from #Duck)
> dcast(setDT(df)[, Q := seq(.N), ACCT], ACCT ~ Q, value.var = c("TEST_DATE", "RESULT"))
ACCT TEST_DATE_1 TEST_DATE_2 TEST_DATE_3 RESULT_1 RESULT_2 RESULT_3
1: 37733 9/1/2016 10/18/2016 11/1/2016 3 2 1
2: 37734 9/1/2016 10/18/2016 11/1/2016 5 4 3
Another option is using melt along with dcast, where the resulting format might be the one you are exactly after
suppressWarnings({
type.convert(
dcast(
melt(
setDT(df)[, Q := seq(.N), ACCT],
id = c("ACCT", "Q"),
measure = c("TEST_DATE", "RESULT")
)[order(ACCT, Q)],
ACCT ~ Q + variable,
value.var = "value"
),
as.is = TRUE
)
})
which gives
ACCT 1_TEST_DATE 1_RESULT 2_TEST_DATE 2_RESULT 3_TEST_DATE 3_RESULT
1: 37733 9/1/2016 3 10/18/2016 2 11/1/2016 1
2: 37734 9/1/2016 5 10/18/2016 4 11/1/2016 3
Take this simple route
library(tidyverse)
df %>% group_by(ACCT, TEST_DATE) %>% summarise(RESULT = mean(RESULT)) %>%
group_by(ACCT) %>% mutate(testno = row_number(), resultno = row_number()) %>%
pivot_wider(id_cols = ACCT, names_from = c("testno", "resultno"), values_from = c(TEST_DATE, RESULT))
# A tibble: 2 x 9
# Groups: ACCT [2]
ACCT TEST_DATE_1_1 TEST_DATE_2_2 TEST_DATE_3_3 TEST_DATE_4_4 RESULT_1_1 RESULT_2_2 RESULT_3_3 RESULT_4_4
<int> <date> <date> <date> <date> <dbl> <dbl> <dbl> <dbl>
1 37733 2016-01-07 2016-01-09 2016-01-11 2016-08-10 5 4.5 1 2
2 37734 2016-01-21 2016-08-20 NA NA 3 4 NA NA
data (dput) used
> dput(df)
structure(list(ACCT = c(37733L, 37733L, 37733L, 37733L, 37734L,
37734L, 37733L), TEST_DATE = structure(c(16809, 17023, 16811,
16807, 17033, 16821, 16809), class = "Date"), RESULT = c(3L,
2L, 1L, 5L, 4L, 3L, 6L)), row.names = c(NA, -7L), class = "data.frame")
df
> df
ACCT TEST_DATE RESULT
1 37733 2016-01-09 3
2 37733 2016-08-10 2
3 37733 2016-01-11 1
4 37733 2016-01-07 5
5 37734 2016-08-20 4
6 37734 2016-01-21 3
7 37733 2016-01-09 6

how to filter data based on the latest date of a date group?

i know my question is not as clear as it should be so i hope my explanation will make it more comprehensible. I have a data like this:
# total_call data
call_id | from_number | retrieved_date
1 1 2020-01-12 12:03:34
2 1 2020-01-12 12:06:34
3 2 2020-01-15 13:02:40
4 2 2020-01-15 13:05:40
5 1 2020-01-12 13:09:34
I want to group the calls by the from_number and the retrieved_date variables, which its time must be within 1 hour since the earliest. After 1 hour, it belongs to a new group. Then i want to filter the latest time of each group. This is the result i want:
# total_call data
call_id | from_number | retrieved_date
2 1 2020-01-12 12:06:34
4 2 2020-01-15 13:05:40
5 1 2020-01-12 13:09:34
Thanks for your attention. I’m looking forward to your reply.
We convert retrieved_date to POSIXct format, arrange the data and create a new group when the current retrieved_date is greater than previous retrieved_date by more than an hour and select the row with max retrieved_date.
library(dplyr)
df %>%
mutate(retrieved_date = lubridate::ymd_hms(retrieved_date)) %>%
arrange(from_number, retrieved_date) %>%
group_by(from_number) %>%
group_by(gr = cumsum(difftime(retrieved_date, lag(retrieved_date,
default = first(retrieved_date)), units = "hours") > 1),add = TRUE) %>%
slice(which.max(retrieved_date)) %>%
ungroup() %>%
select(-gr)
# A tibble: 3 x 3
# call_id from_number retrieved_date
# <int> <int> <dttm>
#1 2 1 2020-01-12 12:06:34
#2 5 1 2020-01-12 13:09:34
#3 4 2 2020-01-15 13:05:40
data
df <- structure(list(call_id = 1:5, from_number = c(1L, 1L, 2L, 2L,
1L), retrieved_date = structure(c(1L, 2L, 4L, 5L, 3L),
.Label = c("2020- 01-12 12:03:34","2020-01-12 12:06:34", "2020-01-12 13:09:34",
"2020-01-15 13:02:40", "2020-01-15 13:05:40"), class = "factor")),
class = "data.frame", row.names = c(NA, -5L))

Apply rules when filtering on grouped dataframe in R?

Given the following dataframe:
structure(list(press_id = c(1L, 1L, 1L, 1L, 1L), time_state = c("start_time",
"end_time", "start_time", "end_time", "start_time"), time_state_val = c(164429106667745,
164429180716697, 164429106667745, 164429180716697, 164429106667745
), timestamp = c(164429106667745, 164429106667745, 164429106667745,
164429106667745, 164429108669078), acc_mag = c(10.4656808698978,
10.4656808698978, 10.4656808698978, 10.4656808698978, 10.458666511955
)), .Names = c("press_id", "time_state", "time_state_val", "timestamp",
"acc_mag"), row.names = c(NA, -5L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), vars = "press_id", drop = TRUE, indices = list(
0:4), group_sizes = 5L, biggest_group_size = 5L, labels = structure(list(
press_id = 1L), row.names = c(NA, -1L), class = "data.frame", vars = "press_id", drop = TRUE, .Names = "press_id"))
I want to apply "rules" when filtering: if time_state == "start_time" then check time_state_interval == min(timestamp) and if it's "end_time" check equality to max(timestamp).
How can I perform such ruled based filter? I am trying to do it with case_when but it doesn't produce the expected result.
df1 %>%
group_by(press_id) %>%
mutate(row = row_number(),
start_time = min(timestamp),
end_time = max(timestamp)) %>%
gather(time_state , time_state_val, -press_id, -row,-timestamp:-vel_ang_mag_avg) %>%
arrange(press_id, row) %>%
select(press_id, time_state, time_state_val, timestamp, acc_mag, vel_ang_mag, -row) %>%
group_by(press_id, time_state) %>%
filter(timestamp == case_when(time_state == "start_time" ~ min(timestamp),
time_state == "end_time" ~ max(timestamp)))
Is this what you have in mind?
df1 %>%
filter((time_state == "start_time" & timestamp == min(timestamp)) |
(time_state == "end_time" & timestamp == max(timestamp)))
# press_id time_state time_state_val timestamp acc_mag
# <int> <chr> <dbl> <dbl> <dbl>
# 1 1 start_time 1.64e14 1.64e14 10.5
# 2 1 start_time 1.64e14 1.64e14 10.5
Try
data %>% group_by(press_id, time_state) %>%
mutate(start_flag=ifelse(time_state=='start_time' & timestamp==min(timestamp),1,0),
end_flag=ifelse(time_state=='end_time' & timestamp==max(timestamp),1,0)) %>%
filter(start_flag==1 | end_flag==1)
# A tibble: 4 x 7
# Groups: press_id, time_state [2]
press_id time_state time_state_val timestamp acc_mag start_flag end_flag
<int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 start_time 1.64e14 1.64e14 10.5 1 0
2 1 end_time 1.64e14 1.64e14 10.5 0 1
3 1 start_time 1.64e14 1.64e14 10.5 1 0
4 1 end_time 1.64e14 1.64e14 10.5 0 1

Average data by month for a given latitude and longitude?

I have a table with the following headers and example data
Lat Long Date Value.
30.497478 -87.880258 01/01/2016 10
30.497478 -87.880258 01/02/2016 15
30.497478 -87.880258 01/05/2016 20
33.284928 -85.803608 01/02/2016 10
33.284928 -85.803608 01/03/2016 15
33.284928 -85.803608 01/05/2016 20
I would like to average the value column on monthly basis for a particular location.
So example output would be
Lat Long Month Avg Value
30.497478 -87.880258 January 15
A solution using dplyr and lubridate.
library(dplyr)
library(lubridate)
dt2 <- dt %>%
mutate(Date = mdy(Date), Month = month(Date)) %>%
group_by(Lat, Long, Month) %>%
summarise(`Avg Value` = mean(Value))
dt2
# A tibble: 2 x 4
# Groups: Lat, Long [?]
Lat Long Month `Avg Value`
<dbl> <dbl> <dbl> <dbl>
1 30.49748 -87.88026 1 15
2 33.28493 -85.80361 1 15
You can try the following, but it first modifies the data frame adding an extra column, Month, using package zoo.
library(zoo)
dat$Month <- as.yearmon(as.Date(dat$Date, "%m/%d/%Y"))
aggregate(Value. ~ Lat + Long + Month, dat, mean)
# Lat Long Month Value.
#1 30.49748 -87.88026 jan 2016 15
#2 33.28493 -85.80361 jan 2016 15
If you don't want to change the original data, make a copy dat2 <- dat and change the copy.
DATA
dat <-
structure(list(Lat = c(30.497478, 30.497478, 30.497478, 33.284928,
33.284928, 33.284928), Long = c(-87.880258, -87.880258, -87.880258,
-85.803608, -85.803608, -85.803608), Date = structure(c(1L, 2L,
4L, 2L, 3L, 4L), .Label = c("01/01/2016", "01/02/2016", "01/03/2016",
"01/05/2016"), class = "factor"), Value. = c(10L, 15L, 20L, 10L,
15L, 20L)), .Names = c("Lat", "Long", "Date", "Value."), class = "data.frame", row.names = c(NA,
-6L))
EDIT.
If you want to compute several statistics, you can define a function that computes them and returns a named vector and call it in aggregate, like the following.
stat <- function(x){
c(Mean = mean(x), Median = median(x), SD = sd(x))
}
agg <- aggregate(Value. ~ Lat + Long + Month, dat, stat)
agg <- cbind(agg[1:3], as.data.frame(agg[[4]]))
agg
# Lat Long Month Mean Median SD
#1 30.49748 -87.88026 jan 2016 15 15 5
#2 33.28493 -85.80361 jan 2016 15 15 5

Resources