I have data as given in input section (dput below), need to convert to output with all values of two rows in one long column. I tried using transpose but cells were getting trimmed.
I don't want to hardcode since in future I might have data in 3 or 4 rows in a similar way.
P.S - I also tried pivot_longer but it didnt help
structure(list(Header = c("Sat 12/3 \n358a-947a\n1017a-229p HRS 10.02",
"Sat 12/10 \n559a-1106a\n1134a-227p HRS 8.00"), X = c("Sun 12/4 ",
"Sun 12/11 "), X.1 = c("Mon 12/5 \n548a-1121a\n1149a-618p\n650p-845p HRS 13.95",
"Mon 12/12 \n500a-1121a\n1151a-547p\n616p-830p HRS 14.53"),
X.2 = c("Tue 12/6 \n359a-1120a\n1150a-400p HRS 11.53",
"Tue 12/13 \n548a-1120a\n1148a-449p HRS 10.54"), X.3 = c("Wed 12/7 \n548a-1119a\n1149a-515p HRS 10.95",
"Wed 12/14 \n429a-1120a\n1150a-432p HRS 11.56"), X.4 = c("Thu 12/8 \n549a-1120a\n1149a-447p HRS 10.48",
"Thu 12/15 \n429a-1121a\n1152a-431p HRS 11.52"), X.5 = c("Fri 12/9 \n548a-1120a\n1148a-218p HRS 8.03",
"Fri 12/16 \n430a-1120a\n1150a-432p HRS 11.55")), class = "data.frame", row.names = c(NA,
-2L))
My try (with a little help)
pivot_longer(df, cols = c(1:7)) %>%
select(value) %>%
mutate(value=str_replace(value,"HRS","")) %>%
separate(.,value,into=c("day","entry1","entry2","entry3"),sep="\n") %>%
separate(.,entry1,into=c("time_in1","time_out1"),sep="-") %>%
separate(.,entry2,into=c("time_in2","time_out2"),sep="-") %>%
separate(.,time_out2,into=c("time_out2","duration1"),remove = FALSE,sep=" ",extra = "merge") %>%
separate(.,entry3,into=c("time_in3","time_out3"),sep="-") %>%
separate(.,time_out3,into=c("time_out3","duration2"),remove = FALSE,sep=" ") %>%
mutate(duration=coalesce(duration1,duration2)) %>%
select(day, duration, time_in1,time_out1,time_in2,time_out2,time_in3,time_out3) %>%
separate(.,day,into=c("date","day"),extra="merge") %>%
mutate(day=mdy(paste0(day,"2021")),
duration=str_trim(duration))
Approach
The key was tidyr::separate_rows(), which not only separates the cell by "\n" but also splits the components into rows rather than columns.
Here, it is much better to split into rows than into columns. Suppose that most cells have 2 or 3 entries separated by "\n"; but there is a "rogue" cell, with an unusually large number (say 9) of entries, generated by someone who repeatedly clocked in and out throughout the day.
While splitting into columns would create arbitrarily many time_in* | time_out* columns, which remain empty (NA) in all rows except the "rogue"
date day duration time_in1 time_out1 time_in2 time_out2 time_in3 time_out3 time_in4 time_out4 time_in5 time_out5 time_in6 time_out6 time_in7 time_out7 time_in8 time_out8 time_in9 time_out9
<chr> <date> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# ... ... ... ... ... ... ... ... ... NA NA NA NA NA NA NA NA NA NA NA NA
splitting into rows will maintain a tame (and stable) columnar structure
date day duration time_in time_out
<date> <chr> <dbl> <chr> <chr>
# ... ... ... ... ...
# ... ... ... ... ...
# ... ... ... ... ...
without any "extraneous" columns (or rows).
Solution
Given your sample data df
df <- structure(list(Header = c("Sat 12/3 \n358a-947a\n1017a-229p HRS 10.02", "Sat 12/10 \n559a-1106a\n1134a-227p HRS 8.00"),
X = c("Sun 12/4 ", "Sun 12/11 "),
X.1 = c("Mon 12/5 \n548a-1121a\n1149a-618p\n650p-845p HRS 13.95", "Mon 12/12 \n500a-1121a\n1151a-547p\n616p-830p HRS 14.53"),
X.2 = c("Tue 12/6 \n359a-1120a\n1150a-400p HRS 11.53", "Tue 12/13 \n548a-1120a\n1148a-449p HRS 10.54"),
X.3 = c("Wed 12/7 \n548a-1119a\n1149a-515p HRS 10.95", "Wed 12/14 \n429a-1120a\n1150a-432p HRS 11.56"),
X.4 = c("Thu 12/8 \n549a-1120a\n1149a-447p HRS 10.48", "Thu 12/15 \n429a-1121a\n1152a-431p HRS 11.52"),
X.5 = c("Fri 12/9 \n548a-1120a\n1148a-218p HRS 8.03", "Fri 12/16 \n430a-1120a\n1150a-432p HRS 11.55")),
class = "data.frame", row.names = c(NA, -2L))
the following workflow
library(tidyverse)
library(stringr)
# ...
# Code to generate 'df'.
# ...
year_observed <- 2016
results <- df %>%
mutate(id = row_number()) %>%
pivot_longer(!id, names_to = "column") %>%
separate(value, into = c("date", "entries"), sep = "\n", fill = "right", extra = "merge", remove = TRUE) %>%
separate(entries, into = c("times", "duration"), sep = "HRS", fill = "right", extra = "warn", remove = TRUE) %>%
mutate(across(date:duration, trimws),
date = as.Date(paste(str_extract(date, "\\d{1,2}/\\d{1,2}$"), year_observed, sep = "/"), format = "%m/%d/%Y"),
duration = as.numeric(duration),
duration = if_else(is.na(duration), 0, duration),
day = format(date, format = "%a")) %>%
separate_rows(times, sep = "\n") %>%
separate(times, into = c("time_in", "time_out"), sep = "-", fill = "warn", extra = "warn", remove = TRUE) %>%
# ...Further Transformations... %>%
select(id, date, day, duration, time_in, time_out)
# View results.
results
should yield results like
# A tibble: 28 x 6
id date day duration time_in time_out
<int> <date> <chr> <dbl> <chr> <chr>
1 1 2016-12-03 Sat 10.0 358a 947a
2 1 2016-12-03 Sat 10.0 1017a 229p
3 1 2016-12-04 Sun 0 NA NA
4 1 2016-12-05 Mon 14.0 548a 1121a
5 1 2016-12-05 Mon 14.0 1149a 618p
6 1 2016-12-05 Mon 14.0 650p 845p
7 1 2016-12-06 Tue 11.5 359a 1120a
8 1 2016-12-06 Tue 11.5 1150a 400p
9 1 2016-12-07 Wed 11.0 548a 1119a
10 1 2016-12-07 Wed 11.0 1149a 515p
# ... with 18 more rows
where id identifies (by row number) the original record in df.
To pivot into your newly specified output, simply execute this code, or append it to the existing workflow:
wide_results <- results %>%
group_by(id, date) %>% mutate(entry = row_number()) %>% ungroup() %>%
pivot_wider(id_cols = c(date, day, duration), names_from = entry, names_glue = "{.value}_{entry}", values_from = c(time_in, time_out)) %>%
# Select so as to alternate between 'time_in_*' and 'time_out_*'.
select(order(as.numeric(str_extract(colnames(.), "\\d+$")), str_extract(colnames(.), "^time_(in|out)"), na.last = FALSE))
# View results.
wide_results
You should obtain wide_results like:
# A tibble: 14 x 9
date day duration time_in_1 time_out_1 time_in_2 time_out_2 time_in_3 time_out_3
<date> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2016-12-03 Sat 10.0 358a 947a 1017a 229p NA NA
2 2016-12-04 Sun 0 NA NA NA NA NA NA
3 2016-12-05 Mon 14.0 548a 1121a 1149a 618p 650p 845p
4 2016-12-06 Tue 11.5 359a 1120a 1150a 400p NA NA
5 2016-12-07 Wed 11.0 548a 1119a 1149a 515p NA NA
6 2016-12-08 Thu 10.5 549a 1120a 1149a 447p NA NA
7 2016-12-09 Fri 8.03 548a 1120a 1148a 218p NA NA
8 2016-12-10 Sat 8 559a 1106a 1134a 227p NA NA
9 2016-12-11 Sun 0 NA NA NA NA NA NA
10 2016-12-12 Mon 14.5 500a 1121a 1151a 547p 616p 830p
11 2016-12-13 Tue 10.5 548a 1120a 1148a 449p NA NA
12 2016-12-14 Wed 11.6 429a 1120a 1150a 432p NA NA
13 2016-12-15 Thu 11.5 429a 1121a 1152a 431p NA NA
14 2016-12-16 Fri 11.6 430a 1120a 1150a 432p NA NA
Note
You must supply the year_observed (here 2016) to correctly contextualize the dates written in m/d format. Otherwise, they will calibrate to the year 2021, which will skew the days of the week.
Warning
These dates (12/3, etc.) are in December, and close to the end of the calendar year. If any of these entries "cross over" (from 2016) into the next year (ex. 1/1/2017), they will be incorrectly calibrated to the former year (ex. 1/1/2016), and thus have an incorrect date and weekday.
However, if your dates do cross over, that's a good indication that the full date (12/3/2016) should have been notated in the original cells, in which case
results <- df %>%
# ... %>%
mutate(
# ...
date = as.Date(str_extract(date, "(\\d{1,2}/){2,2}\\d{4,4}$"), format = "%m/%d/%Y")
# ...
) # ... %>%
would have sufficed to properly parse the dates.
Related
I have question related to calculation based on timestamp:
I have a big dataframe dfwith Timestamp (whole year), Export_Country, Import_Country and respective Value in each hour of the year.
For example here is the sample dataframe df:
df <- data.frame(Timestamp=c("2020-01-01 00:00:00.000","2020-01-01 00:00:00.000","2020-01-01 00:00:00.000","2020-01-01 00:00:00.000","2020-01-01 00:00:00.000","2020-01-01 00:00:00.000"),
Export_Country=c('AT','DE','CH','DE','CZ','DE'),
Import_Country=c('DE','AT','DE','CH','DE','CZ'),
Value=c(170.06,289.37,1133.47,0,68.29,0.32),
stringsAsFactors=FALSE)
I want to write a function which can calculate the net value in each timestamp within two countries. The output should look like the dataframe df1:
df2<- data.frame(Timestamp=c("2F020-01-01 00:00:00.000","2020-01-01 00:00:00.000","2020-01-01 00:00:00.000"),
Export_Country=c('DE','CH','CZ'),
Import_Country=c('AT','DE','DE'),
Value=c(119.31,1133.47,67.97),
stringsAsFactors=FALSE)
I was trying to do something like:
df3<- df %<>%
group_by(Timestamp,Export_Country,Import_Country) %>%
summarise(Value=sum(Value))
Note this is output of str(mydataframe)
'data.frame': 65520 obs. of 4 variables:
$ DateTime : chr "2020-01-02 12:00:00.000" "2020-01-02 12:00:00.000" "2020-01-02 12:00:00.000" "2020-01-02 12:00:00.000" ...
$ Export_Country: Factor w/ 70 levels "AL","AT","BA",..: 15 13 15 10 13 2 53 13 46 10 ...
$ Import_Country: Factor w/ 70 levels "AL","AT","BA",..: 10 46 13 15 2 13 10 15 13 53 ...
$ FlowValue : num 417 251 898 0 1089 ...
Can anyone help me? Thank you.
Using tidyverse we can pivot the data to longer format and
# gets the next other country's index based on the current country index
funcp <- function(x) x + 1 - 2 * (x%%2 == 0)
df %>%
# pivoting to longer format in order to facilitate data manipulation
pivot_longer(cols=ends_with("Country"), values_to = "country") %>%
# remove _Country from (Import|Export)_Country and getting the real value of the transaction Imports = - Value
mutate(name=sub("_.+","", name), Value=Value*(1-2*(name=="Import"))) %>%
# adding a with column that contains the counterpart
tibble(with=.$country[funcp(1:nrow(.))]) %>%
# finally grouping by the Timestamp, the country and the counterpart to get the actual Net value
group_by(Timestamp, country, with) %>% summarise(Value=sum(Value)) -> df2
df2
#> # A tibble: 6 x 4
#> # Groups: Timestamp, country [4]
#> Timestamp country with Value
#> <chr> <chr> <chr> <dbl>
#> 1 2020-01-01 00:00:00.000 AT DE -119.
#> 2 2020-01-01 00:00:00.000 CH DE 1133.
#> 3 2020-01-01 00:00:00.000 CZ DE 68.0
#> 4 2020-01-01 00:00:00.000 DE AT 119.
#> 5 2020-01-01 00:00:00.000 DE CH -1133.
#> 6 2020-01-01 00:00:00.000 DE CZ -68.0
If you want to get only the positive nets then you can filter the results :
df2 %>% filter(Value >=0)
#> # A tibble: 3 x 4
#> # Groups: Timestamp, country [3]
#> Timestamp country with Value
#> <chr> <chr> <chr> <dbl>
#> 1 2020-01-01 00:00:00.000 CH DE 1133.
#> 2 2020-01-01 00:00:00.000 CZ DE 68.0
#> 3 2020-01-01 00:00:00.000 DE AT 119.
Note : czech to germany Value is rounded in the printing process but is equal to 67.97 in the tibble
I know that the following function is complicated and that problably there are simpler solutions but it seems to work.
fun <- function(X){
f <- function(x){
x[[3]]*(2*(x[[1]] < x[[2]]) - 1)
}
icontr <- grep("Country", names(X), value = TRUE)
X[["Value"]] <- f(X[c(icontr, "Value")])
X[icontr] <- t(apply(X[icontr], 1, sort))
fmla <- paste(c("Timestamp", icontr), collapse = "+")
fmla <- paste("Value", fmla, sep = "~")
fmla <- as.formula(fmla)
out <- aggregate(fmla, X, sum)
i <- out[["Value"]] < 0
tmp <- out[["Export_Country"]][i]
out[["Export_Country"]][i] <- out[["Import_Country"]][i]
out[["Import_Country"]][i] <- tmp
out[["Value"]][i] <- -out[["Value"]][i]
out
}
fun(df)
# Timestamp Export_Country Import_Country Value
#1 2020-01-01 00:00:00.000 DE AT 119.31
#2 2020-01-01 00:00:00.000 CH DE 1133.47
#3 2020-01-01 00:00:00.000 CZ DE 67.97
all.equal(fun(df), df2)
#[1] TRUE
Maybe you might want to combine Import_Country and Export_Country into a single character string? Then you can group_by this, and take difference between the two Values present. This assumes you only have two countries to combine for each Timestamp. It also subtracts import from export consistently.
library(tidyverse)
df %>%
mutate(CountryDyad = paste(pmin(Export_Country, Import_Country),
pmax(Export_Country, Import_Country),
sep = "-")) %>%
group_by(Timestamp, CountryDyad) %>%
summarise(Value = Value[which(startsWith(CountryDyad, Import_Country))] -
Value[which(startsWith(CountryDyad, Export_Country))])
Output
Timestamp CountryDyad Value
<chr> <chr> <dbl>
1 2020-01-01 00:00:00.000 AT-DE 119.
2 2020-01-01 00:00:00.000 CH-DE -1133.
3 2020-01-01 00:00:00.000 CZ-DE -68.0
I want to create a dataframe from a given start and end date:
start_date <- as.Date("2020-05-17")
end_date <- as.Date("2020-06-23")
For each row in this dataframe, I should have the start day and end day of the month, so the expected output is:
start end month year
2020-05-17 2020-05-31 May 2020
2020-06-01 2020-06-23 June 2020
I have tried to create a sequence, but I'm stuck on what to do next:
day_seq <- seq(start_date, end_date, 1)
Please, a base R or tidyverse solution will be greatly appreciated.
1) yearmon Using start_date and end_date from the question create a yearmon sequence and then each of the desired columns is a simple one line computation. The stringAsFactors line can be omitted under R 4.0 onwards as that is the default there.
library(zoo)
ym <- seq(as.yearmon(start_date), as.yearmon(end_date), 1/12)
data.frame(start = pmax(start_date, as.Date(ym)),
end = pmin(end_date, as.Date(ym, frac = 1)),
month = month.name[cycle(ym)],
year = as.integer(ym),
stringsAsFactors = FALSE)
giving:
start end month year
1 2020-05-17 2020-05-31 May 2020
2 2020-06-01 2020-06-23 June 2020
2) Base R This follows similar logic and gives the same answer. We first define a function month1 which given a Date class vector x returns a Date vector the same length but for the first of the month.
month1 <- function(x) as.Date(cut(x, "month"))
months <- seq(month1(start_date), month1(end_date), "month")
data.frame(start = pmax(start_date, months),
end = pmin(end_date, month1(months + 31) - 1),
month = format(months, "%B"),
year = as.numeric(format(months, "%Y")),
stringsAsFactors = FALSE)
A while ago that I used the tidyverse, but here is my go at things..
sample data
different sample data to tagckle some problems wher the year changes..
start_date <- as.Date("2020-05-17")
end_date <- as.Date("2021-06-23")
code
library( tidyverse )
library( lubridate )
#create a sequence of days from start to end
tibble( date = seq( start_date, end_date, by = "1 day" ) ) %>%
mutate( month = lubridate::month( date ),
year = lubridate::year( date ),
end = as.Date( paste( year, month, lubridate::days_in_month(date), sep = "-" ) ) ) %>%
#the end of the last group is now always larger than tghe maximum date... repair!
mutate( end = if_else( end > max(date), max(date), end ) ) %>%
group_by( year, month ) %>%
summarise( start = min( date ),
end = max( end ) ) %>%
select( start, end, month, year )
output
# # A tibble: 14 x 4
# # Groups: year [2]
# start end month year
# <date> <date> <dbl> <dbl>
# 1 2020-05-17 2020-05-31 5 2020
# 2 2020-06-01 2020-06-30 6 2020
# 3 2020-07-01 2020-07-31 7 2020
# 4 2020-08-01 2020-08-31 8 2020
# 5 2020-09-01 2020-09-30 9 2020
# 6 2020-10-01 2020-10-31 10 2020
# 7 2020-11-01 2020-11-30 11 2020
# 8 2020-12-01 2020-12-31 12 2020
# 9 2021-01-01 2021-01-31 1 2021
# 10 2021-02-01 2021-02-28 2 2021
# 11 2021-03-01 2021-03-31 3 2021
# 12 2021-04-01 2021-04-30 4 2021
# 13 2021-05-01 2021-05-31 5 2021
# 14 2021-06-01 2021-06-23 6 2021
For the specific period in your question, you may use:
library(lubridate)
start_date <- as.Date("2020-05-17")
end_date <- as.Date("2020-06-23")
start <- c(start_date, floor_date(end_date, unit = 'months'))
end <- c(ceiling_date(start_date, unit = 'months'), end_date)
month <- c(as.character(month(start[1], label = TRUE)),
as.character(month(start[2], label = TRUE)))
year <- c(year(start[1]), year(start[2]))
data.frame(start, end, month, year, stringsAsFactors = FALSE)
Here is one approach using intervals with lubridate. You would create a full interval between the 2 dates of interest, and then intersect with monthly ranges for each month (first to last day each month).
library(tidyverse)
library(lubridate)
start_date <- as.Date("2020-05-17")
end_date <- as.Date("2021-08-23")
full_int <- interval(start_date, end_date)
month_seq = seq(start_date, end_date, by = "month")
month_int = interval(floor_date(month_seq, "month"), ceiling_date(month_seq, "month") - days(1))
data.frame(interval = intersect(full_int, month_int)) %>%
mutate(start = int_start(interval),
end = int_end(interval),
month = month.abb[month(start)],
year = year(start)) %>%
select(-interval)
Output
start end month year
1 2020-05-17 2020-05-31 May 2020
2 2020-06-01 2020-06-30 Jun 2020
3 2020-07-01 2020-07-31 Jul 2020
4 2020-08-01 2020-08-31 Aug 2020
5 2020-09-01 2020-09-30 Sep 2020
6 2020-10-01 2020-10-31 Oct 2020
7 2020-11-01 2020-11-30 Nov 2020
8 2020-12-01 2020-12-31 Dec 2020
9 2021-01-01 2021-01-31 Jan 2021
10 2021-02-01 2021-02-28 Feb 2021
11 2021-03-01 2021-03-31 Mar 2021
12 2021-04-01 2021-04-30 Apr 2021
13 2021-05-01 2021-05-31 May 2021
14 2021-06-01 2021-06-30 Jun 2021
15 2021-07-01 2021-07-31 Jul 2021
16 2021-08-01 2021-08-23 Aug 2021
I have hourly values of temperature measurements and I wish to calculate the average per day only for complete (i.e. with 24 measurements) days. Incomplete days would then be summarized as "NA".
I have grouped the values together per year, month and day and call summarize().
I have three month of data missing which appears as a gap in my ggplot function and which is what I want to achieve with the rest. The problem is that when I call summarize() to calculate the mean of my values, days with only 1 or 2 measurements also get called. Only those with all missing values (24) appear as "NA".
Date TempUrb TempRur UHI
1 2011-03-21 22:00:00 10.1 11.67000 -1.570000
2 2011-03-21 23:00:00 9.9 11.67000 -1.770000
3 2011-03-22 00:00:00 10.9 11.11000 -0.210000
4 2011-03-22 01:00:00 10.7 10.56000 0.140000
5 2011-03-22 02:00:00 9.7 10.00000 -0.300000
6 2011-03-22 03:00:00 9.5 10.00000 -0.500000
7 2011-03-22 04:00:00 9.4 8.89000 0.510000
8 2011-03-22 05:00:00 8.4 8.33500 0.065000
9 2011-03-22 06:00:00 8.2 7.50000 0.700000
AvgUHI <- UHI %>% group_by(year(Date), add = TRUE) %>%
group_by(month(Date), add = TRUE) %>%
group_by(day(Date), add = TRUE, .drop = TRUE) %>%
summarize(AvgUHI = mean(UHI, na.rm = TRUE))
# A tibble: 2,844 x 4
# Groups: year(Date), month(Date) [95]
`year(Date)` `month(Date)` `day(Date)` AvgUHI
<int> <int> <int> <dbl>
1476 2015 4 4 0.96625000
1477 2015 4 5 -0.11909722
1478 2015 4 6 -0.60416667
1479 2015 4 7 -0.92916667
1480 2015 4 8 NA
1481 2015 4 9 NA
AvgUHI<- AvgUHI %>% group_by(`year(Date)`, add = TRUE) %>%
group_by(`month(Date)`, add = TRUE) %>%
summarize(AvgUHI= mean(AvgUHI, na.rm = TRUE))
# A tibble: 95 x 3
# Groups: year(Date) [9]
`year(Date)` `month(Date)` AvgUHI
<int> <int> <dbl>
50 2015 4 0.580887346
51 2015 5 0.453815051
52 2015 6 0.008479618
As you can see above on the final table, I have an average for 04-2015, while I am missing data on that month (08 - 09/04/2015 on this example represented on the second table).
The same happens when I calculate AvgUHI and I'm missing hourly data.
I simply would like to see on the last table the AvgUHI for 04-2015 be NA.
E.g: of my graph1
The following will give a dataframe aggregated by day, where only the complete days, with 4 observations, are not NA. Then you can group by month to have the final dataframe.
UHI %>%
mutate(Day = as.Date(Date)) %>%
group_by(Day) %>%
mutate(n = n(), tmpUHI = if_else(n == 24, UHI, NA_real_)) %>%
summarize(AvgUHI = mean(tmpUHI)) %>%
full_join(data.frame(Day = seq(min(.$Day), max(.$Day), by = "day"))) %>%
arrange(Day) -> AvgUHI
For hours look at Rui Barradas' answer. For months the following code worked:
AvgUHI %>%
group_by(year(Day), add = TRUE) %>%
group_by(month(Day), add = TRUE) %>%
mutate(sum = sum(is.na(AvgUHI)), tmpUHI = if_else(sum <= 10, AvgUHI, NA_real_)) %>%
summarise(AvgUHI = mean(tmpUHI, na.rm = TRUE)) -> AvgUHI
This question already has an answer here:
Sort year-month column by year AND month
(1 answer)
Closed 1 year ago.
I have dates in the format mm/yyyy in column 1, and then results in column 2.
month Result
01/2018 96.13636
02/2018 96.40000
3/2018 94.00000
04/2018 97.92857
05/2018 95.75000
11/2017 98.66667
12/2017 97.78947
How can I order by month such that it will start from the first month (11/2017) and end (05/2018).
I have tried a few 'orders', but none seem to be ordering by year and then by month
In tidyverse (w/ lubridate added):
library(tidyverse)
library(lubridate)
dfYrMon <-
df1 %>%
mutate(date = parse_date_time(month, "my"),
year = year(date),
month = month(date)
) %>%
arrange(year, month) %>%
select(date, year, month, result)
With data:
df1 <- tibble(month = c("01/2018", "02/2018", "03/2018", "04/2018", "05/2018", "11/2017", "12/2017"),
result = c(96.13636, 96.4, 94, 97.92857, 95.75, 98.66667, 97.78947))
Will get you this 'dataframe':
# A tibble: 7 x 4
date year month result
<dttm> <dbl> <dbl> <dbl>
1 2017-11-01 2017 11 98.66667
2 2017-12-01 2017 12 97.78947
3 2018-01-01 2018 1 96.13636
4 2018-02-01 2018 2 96.40000
5 2018-03-01 2018 3 94.00000
6 2018-04-01 2018 4 97.92857
7 2018-05-01 2018 5 95.75000
Making your data values atomic (year in its own column, month in its own column) generally improves the ease of manipulation.
Or if you want to use base R date manipulations instead of lubridate's:
library(tidyverse)
dfYrMon_base <-
df1 %>%
mutate(date = as.Date(paste("01/", month, sep = ""), "%d/%m/%Y"),
year = format(as.Date(date, format="%d/%m/%Y"),"%Y"),
month = format(as.Date(date, format="%d/%m/%Y"),"%m")
) %>%
arrange(year, month) %>%
select(date, year, month, result)
dfYrMon_base
Note the datatypes created.
# A tibble: 7 x 4
date year month result
<date> <chr> <chr> <dbl>
1 2017-11-01 2017 11 98.66667
2 2017-12-01 2017 12 97.78947
3 2018-01-01 2018 01 96.13636
4 2018-02-01 2018 02 96.40000
5 2018-03-01 2018 03 94.00000
6 2018-04-01 2018 04 97.92857
7 2018-05-01 2018 05 95.75000
We can convert it to yearmon class and then do the order
library(zoo)
out <- df1[order(as.yearmon(df1$month, "%m/%Y"), df1$Result),]
row.names(out) <- NULL
out
# month Result
#1 11/2017 98.66667
#2 12/2017 97.78947
#3 01/2018 96.13636
#4 02/2018 96.40000
#5 03/2018 94.00000
#6 04/2018 97.92857
#7 05/2018 95.75000
data
df1 <- structure(list(month = c("01/2018", "02/2018", "03/2018", "04/2018",
"05/2018", "11/2017", "12/2017"), Result = c(96.13636, 96.4,
94, 97.92857, 95.75, 98.66667, 97.78947)), .Names = c("month",
"Result"), class = "data.frame",
row.names = c("1", "2", "3",
"4", "5", "6", "7"))
Suppose I have a daily rain data.frame like this:
df.meteoro = data.frame(Dates = seq(as.Date("2017/1/19"), as.Date("2018/1/18"), "days"),
rain = rnorm(length(seq(as.Date("2017/1/19"), as.Date("2018/1/18"), "days"))))
I'm trying to sum the accumulated rain between a 14 days interval with this code:
library(tidyverse)
library(lubridate)
df.rain <- df.meteoro %>%
mutate(TwoWeeks = round_date(df.meteoro$data, "14 days")) %>%
group_by(TwoWeeks) %>%
summarise(sum_rain = sum(rain))
The problem is that it isn't starting on 2017-01-19 but on 2017-01-15 and I was expecting my output dates to be:
"2017-02-02" "2017-02-16" "2017-03-02" "2017-03-16" "2017-03-30" "2017-04-13"
"2017-04-27" "2017-05-11" "2017-05-25" "2017-06-08" "2017-06-22" "2017-07-06" "2017-07-20"
"2017-08-03" "2017-08-17" "2017-08-31" "2017-09-14" "2017-09-28" "2017-10-12" "2017-10-26"
"2017-11-09" "2017-11-23" "2017-12-07" "2017-12-21" "2018-01-04" "2018-01-18"
TL;DR I have a year long daily rain data.frame and want to sum the accumulate rain for the dates above.
Please help.
Use of round_date in the way you have shown it will not give you 14-day periods as you might expect. I have taken a different approach in this solution and generated a sequence of dates between your first and last dates and grouped these into 14-day periods then joined the dates to your observations.
startdate = min(df.meteoro$Dates)
enddate = max(df.meteoro$Dates)
dateseq =
data.frame(Dates = seq.Date(startdate, enddate, by = 1)) %>%
mutate(group = as.numeric(Dates - startdate) %/% 14) %>%
group_by(group) %>%
mutate(starts = min(Dates))
df.rain <- df.meteoro %>%
right_join(dateseq) %>%
group_by(starts) %>%
summarise(sum_rain = sum(rain))
head(df.rain)
> head(df.rain)
# A tibble: 6 x 2
starts sum_rain
<date> <dbl>
1 2017-01-19 6.09
2 2017-02-02 5.55
3 2017-02-16 -3.40
4 2017-03-02 2.55
5 2017-03-16 -0.12
6 2017-03-30 8.95
Using a right-join to the date sequence is to ensure that if there are missing observation days that spanned a complete time period you'd still get that period listed in the result (though in your case you have a complete year of dates anyway).
round_date rounds to the nearest multiple of unit (here, 14 days) since some epoch (probably the Unix epoch of 1970-01-01 00:00:00), which doesn't line up with your purpose.
To get what you want, you can do the following:
df.rain = df.meteoro %>%
mutate(days_since_start = as.numeric(Dates - as.Date("2017/1/18")),
TwoWeeks = as.Date("2017/1/18") + 14*ceiling(days_since_start/14)) %>%
group_by(TwoWeeks) %>%
summarise(sum_rain = sum(rain))
This computes days_since_start as the days since 2017/1/18 and then manually rounds to the next multiple of two weeks.
Assuming you want to round to the closest date from the ones you have specified I guess the following will work
targetDates<-seq(ymd("2017-02-02"),ymd("2018-01-18"),by='14 days')
df.meteoro$Dates=targetDates[sapply(df.meteoro$Dates,function(x) which.min(abs(interval(targetDates,x))))]
sum_rain=ddply(df.meteoro,.(Dates),summarize,sum_rain=sum(rain,na.rm=T))
as you can see not all dates have the same number of observations. Date "2017-02-02" for instance has all the records between "2017-01-19" until "2017-02-09", which are 22 records. From "2017-02-10" on dates are rounded to "2017-02-16" etc.
This may be a cheat, but assuming each row/observation is a separate day, then why not just group by every 14 rows and sum.
# Assign interval groups, each 14 rows
df.meteoro$my_group <-rep(1:100, each=14, length.out=nrow(df.meteoro))
# Grab Interval Names
my_interval_names <- df.meteoro %>%
select(-rain) %>%
group_by(my_group) %>%
slice(1)
# Summarise
df.meteoro %>%
group_by(my_group) %>%
summarise(rain = sum(rain)) %>%
left_join(., my_interval_names)
#> Joining, by = "my_group"
#> # A tibble: 27 x 3
#> my_group rain Dates
#> <int> <dbl> <date>
#> 1 1 3.86 2017-01-19
#> 2 2 -0.581 2017-02-02
#> 3 3 -0.876 2017-02-16
#> 4 4 1.80 2017-03-02
#> 5 5 3.79 2017-03-16
#> 6 6 -3.50 2017-03-30
#> 7 7 5.31 2017-04-13
#> 8 8 2.57 2017-04-27
#> 9 9 -1.33 2017-05-11
#> 10 10 5.41 2017-05-25
#> # ... with 17 more rows
Created on 2018-03-01 by the reprex package (v0.2.0).