How to group by the same year instead of the same month in R

My data is grouped in month-year format under the names of the corresponding months, but I want to group it under the corresponding years instead.
I have a list like the one below, where the values are grouped by month:
$Apr
$Apr$`04-2036`
date value
116 04-25-2036 1.14
117 04-26-2036 0.67
$Apr$`04-2037`
date value
478 04-22-2037 0
479 04-23-2037 0
$Mar
$Mar$`03-2037`
date value
446 03-21-2037 1.67
447 03-22-2037 0.00
$May
$May$`05-2036`
date value
146 05-25-2036 0.00
147 05-26-2036 2.31
And here is its structure:
sample<-structure(list(Apr = structure(list(`04-2036` = structure(list(
date = c("04-25-2036", "04-26-2036"), value = c(1.14, 0.67
)), .Names = c("date", "value"), row.names = 116:117, class = "data.frame"),
`04-2037` = structure(list(date = c("04-22-2037", "04-23-2037"
), value = c(0, 0)), .Names = c("date", "value"), row.names = 478:479, class = "data.frame")), .Names = c("04-2036",
"04-2037")), Mar = structure(list(`03-2037` = structure(list(
date = c("03-21-2037", "03-22-2037"), value = c(1.67, 0)), .Names = c("date",
"value"), row.names = 446:447, class = "data.frame")), .Names = "03-2037"),
May = structure(list(`05-2036` = structure(list(date = c("05-25-2036",
"05-26-2036"), value = c(0, 2.31)), .Names = c("date", "value"
), row.names = 146:147, class = "data.frame")), .Names = "05-2036")), .Names = c("Apr",
"Mar", "May"))
Desired output: the data grouped under the same years.
$`2036`
$`2036`$`04-2036`
date value
116 04-25-2036 1.14
117 04-26-2036 0.67
$`2036`$`05-2036`
date value
146 05-25-2036 0.00
147 05-26-2036 2.31
$`2037`
$`2037`$`03-2037`
date value
446 03-21-2037 1.67
447 03-22-2037 0.00
$`2037`$`04-2037`
date value
478 04-22-2037 0
479 04-23-2037 0
The structure of the output will be like this:
output<-structure(list(`2036` = structure(list(`04-2036` = structure(list(
date = c("04-25-2036", "04-26-2036"), value = c(1.14, 0.67
)), .Names = c("date", "value"), row.names = 116:117, class = "data.frame"),
`05-2036` = structure(list(date = c("05-25-2036", "05-26-2036"
), value = c(0, 2.31)), .Names = c("date", "value"), row.names = 146:147, class = "data.frame")), .Names = c("04-2036",
"05-2036")), `2037` = structure(list(`03-2037` = structure(list(
date = c("03-21-2037", "03-22-2037"), value = c(1.67, 0)), .Names = c("date",
"value"), row.names = 446:447, class = "data.frame"), `04-2037` = structure(list(
date = c("04-22-2037", "04-23-2037"), value = c(0, 0)), .Names = c("date",
"value"), row.names = 478:479, class = "data.frame")), .Names = c("03-2037",
"04-2037"))), .Names = c("2036", "2037"))

You can combine the data into one data frame, extract the year from the date, and split (mdy() and year() come from lubridate, which needs to be loaded as well).
library(dplyr)
library(purrr)
library(lubridate)
map_df(sample, bind_rows, .id = 'month') %>%
  mutate(date = mdy(date),
         year = year(date)) %>%
  split(.$year) %>%
  map(~ split(.x, .x$month))
#$`2036`
#$`2036`$Apr
# month date value year
#1 Apr 2036-04-25 1.14 2036
#2 Apr 2036-04-26 0.67 2036
#$`2036`$May
# month date value year
#7 May 2036-05-25 0.00 2036
#8 May 2036-05-26 2.31 2036
#$`2037`
#$`2037`$Apr
# month date value year
#3 Apr 2037-04-22 0 2037
#4 Apr 2037-04-23 0 2037
#$`2037`$Mar
# month date value year
#5 Mar 2037-03-21 1.67 2037
#6 Mar 2037-03-22 0.00 2037

We can use base R
df1 <- transform(do.call(rbind, Map(cbind, Month = names(sample),
                                    lapply(sample, function(x) do.call(rbind, x)))),
                 date = as.Date(date, "%m-%d-%Y"))
df1$year <- format(df1$date, "%Y")
row.names(df1) <- NULL
lapply(split(df1, df1$year), function(x) split(x, x$Month))
$`2036`
$`2036`$Apr
Month date value year
1 Apr 2036-04-25 1.14 2036
2 Apr 2036-04-26 0.67 2036
$`2036`$May
Month date value year
7 May 2036-05-25 0.00 2036
8 May 2036-05-26 2.31 2036
$`2037`
$`2037`$Apr
Month date value year
3 Apr 2037-04-22 0 2037
4 Apr 2037-04-23 0 2037
$`2037`$Mar
Month date value year
5 Mar 2037-03-21 1.67 2037
6 Mar 2037-03-22 0.00 2037
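If you want the result keyed by the original `MM-YYYY` names, exactly as in the desired output (rather than by month name with helper columns), a small base R sketch can flatten away the month level and re-split by year. This assumes the inner list names always end in the four-digit year, as in the sample:
flat <- do.call(c, unname(sample))                      # names become "04-2036", "04-2037", ...
output <- lapply(split(flat, sub(".*-", "", names(flat))),
                 function(x) x[order(names(x))])        # sort the "MM-YYYY" entries within each year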

Related

R: Sorting rows with time within multiple time intervals

I want to pick up rows of which time data is between multiple intervals.
The data frame is like this:
dputs
structure(list(ID = c("A", "A", "A", "A", "A", "B", "B", "B",
"B", "B"), score_time = c("2022/09/01 9:00:00", "2022/09/02 18:00:00",
"2022/09/03 12:00:00", NA, NA, "2022/09/15 18:00:00", "2022/09/18 20:00:00",
NA, NA, NA), score = c(243, 232, 319, NA, NA, 436, 310, NA, NA,
NA), treatment_start = c(NA, NA, NA, "2022/09/02 8:00:00", "2022/09/03 11:00:00",
NA, NA, "2022/09/15 8:00:00", "2022/09/16 14:00:00", "2022/09/16 23:00:00"
), treatment_end = c(NA, NA, NA, "2022/09/02 22:00:00", "2022/09/09 12:00:00",
NA, NA, "2022/09/16 2:00:00", "2022/09/16 22:00:00", "2022/09/17 0:00:00"
)), row.names = c(NA, -10L), spec = structure(list(cols = list(
ID = structure(list(), class = c("collector_character", "collector"
)), score_time = structure(list(), class = c("collector_character",
"collector")), score = structure(list(), class = c("collector_double",
"collector")), treatment_start = structure(list(), class = c("collector_character",
"collector")), treatment_end = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), problems = <pointer: 0x6000000190b0>, class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
ID score_time score treatment_start treatment_end
<chr> <chr> <dbl> <chr> <chr>
1 A 2022/09/01 9:00:00 243 NA NA
2 A 2022/09/02 18:00:00 232 NA NA
3 A 2022/09/03 12:00:00 319 NA NA
4 A NA NA 2022/09/02 8:00:00 2022/09/02 22:00:00
5 A NA NA 2022/09/03 11:00:00 2022/09/09 12:00:00
6 B 2022/09/15 18:00:00 436 NA NA
7 B 2022/09/18 20:00:00 310 NA NA
8 B NA NA 2022/09/15 8:00:00 2022/09/16 2:00:00
9 B NA NA 2022/09/16 14:00:00 2022/09/16 22:00:00
10 B NA NA 2022/09/16 23:00:00 2022/09/17 0:00:00
Multiple score values are given for each ID, along with the measurement time, and each ID has more than one treatment period described by its start and end times.
My target is the score values that were measured during the treatment periods.
I tried the lubridate and tidyverse packages to build intervals but could not work out how to apply an "%in%"-style check.
Here is my attempt so far, which puts the intervals in the same rows as the score values.
data %>%
  mutate(trt_interval = interval(start = treatment_start, end = treatment_end)) %>%
  group_by(ID) %>%
  mutate(num = row_number()) %>%
  pivot_wider(names_from = num, names_prefix = "intvl", values_from = trt_interval) %>%
  fill(c(intvl1:last_col()), .direction = "up")
The desired output is like this (the first score of A and the last score of B are dropped because their score_time falls outside every treatment interval):
ID score
<chr> <dbl>
1 A 232
2 A 319
3 B 436
I would like to know a smarter way to put the data in one row and how to apply an "%in%"-style check against multiple intervals.
Sorry that the question is not well formed and involves multiple steps, but any advice would be a great help to me.
Hi, I would first create two separate data frames: one for the scores and one for the intervals. Then I would join them and filter for the scores that fall within a treatment interval.
library(dplyr)
library(lubridate)
data_score <- data %>%
  filter(!is.na(score_time)) %>%
  select(-starts_with("treat")) %>%
  mutate(score_time = ymd_hms(score_time))
data_score
data_interval <- data %>%
  filter(is.na(score_time)) %>%
  select(ID, starts_with("treat")) %>%
  mutate(trt_interval = interval(start = treatment_start, end = treatment_end))
data_score %>%
  inner_join(data_interval, by = "ID") %>%
  filter(score_time %within% trt_interval)
Hope this helps!!
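If an ID can have several treatment periods, another hedged sketch (my own, not from the thread) reuses the data_score / data_interval objects above and checks each score against all of that ID's intervals with %within%; it assumes every ID has at least one interval:
library(dplyr)
library(purrr)
library(lubridate)
intervals_by_id <- data_interval %>%
  group_by(ID) %>%
  summarise(intervals = list(trt_interval), .groups = "drop")
data_score %>%
  left_join(intervals_by_id, by = "ID") %>%
  # keep a score if its time falls within any of that ID's intervals
  filter(map2_lgl(score_time, intervals, ~ any(.x %within% .y))) %>%
  select(ID, score)
This avoids the duplicated rows that the join produces when a score matches more than one interval.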

R: replace value of columns for other columns based on condition

I have a dataframe with the following structure:
Timestamp Value1 Value2 Problem1 Problem2
00:00 32 40 No No
00:05 12 55 No No
00:10 14 42 Yes No
00:15 50 33 Yes No
00:20 78 47 No No
Problem1 indicates whether there is a problem with Value1, and Problem2 whether there is a problem with Value2. If Problem1 is Yes, I need to replace Value1 with Value2. If both have problems, both should stay unchanged.
My problem is that I won't know in advance how many Value and Problem columns there will be. So, with more than two, I need to replace each value that has a problem with the average of the values that do not have problems.
So, in another example:
Timestamp Value1 Value2 Value3 Problem1 Problem2 Problem3
00:00 32 40 45 No No No
00:05 12 55 48 No No No
00:10 14 42 55 Yes No No
00:15 50 33 13 Yes No Yes
00:20 78 47 75 No No No
Here I'd need to replace Value1 at 00:10 by the average of Value2 and Value3. Also, I'd need to replace Value1 and Value3 at 00:15 by Value2.
I bet there is a more elegant solution.
library(tidyr)
library(dplyr)
df %>%
  mutate(across(starts_with("Problem"), ~ .x == "Yes")) %>%
  pivot_longer(-Timestamp, names_to = c("name", "id"), names_pattern = "(.*)(\\d+)") %>%
  pivot_wider() %>%
  group_by(Timestamp) %>%
  mutate(Value = case_when(sum(Problem) == 0 | sum(Problem) == n() | !Problem ~ Value,
                           TRUE ~ sum(Value * (1 - Problem)) / sum(1 - Problem))) %>%
  pivot_longer(cols = c("Value", "Problem")) %>%
  mutate(name = paste0(name, id), .keep = "unused") %>%
  pivot_wider() %>%
  ungroup() %>%
  mutate(across(starts_with("Problem"), ~ ifelse(.x == 1, "Yes", "No")))
returns
# A tibble: 5 x 7
Timestamp Value1 Problem1 Value2 Problem2 Value3 Problem3
<time> <dbl> <chr> <dbl> <chr> <dbl> <chr>
1 00'00" 32 No 40 No 45 No
2 05'00" 12 No 55 No 48 No
3 10'00" 48.5 Yes 42 No 55 No
4 15'00" 33 Yes 33 No 33 Yes
5 20'00" 78 No 47 No 75 No
What approach did I use?
1. Transform the Problem variables into booleans; R can use booleans in calculations because they are coerced to doubles.
2. Turn the Value/Problem numbers into an id, so for every timestamp there are several rows of Value and Problem.
3. Calculate the new value based on the number of problems and on whether the value itself is problematic.
4. Restore the original shape of the data.frame.
Data
df <- structure(list(Timestamp = structure(c(0, 300, 600, 900, 1200
), class = c("hms", "difftime"), units = "secs"), Value1 = c(32,
12, 14, 50, 78), Value2 = c(40, 55, 42, 33, 47), Value3 = c(45,
48, 55, 13, 75), Problem1 = c("No", "No", "Yes", "Yes", "No"),
Problem2 = c("No", "No", "No", "No", "No"), Problem3 = c("No",
"No", "No", "Yes", "No")), problems = structure(list(row = 5L,
col = "Problem3", expected = "", actual = "embedded null",
file = "literal data"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")), class = c("spec_tbl_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -5L), spec = structure(list(
cols = list(Timestamp = structure(list(format = ""), class = c("collector_time",
"collector")), Value1 = structure(list(), class = c("collector_double",
"collector")), Value2 = structure(list(), class = c("collector_double",
"collector")), Value3 = structure(list(), class = c("collector_double",
"collector")), Problem1 = structure(list(), class = c("collector_character",
"collector")), Problem2 = structure(list(), class = c("collector_character",
"collector")), Problem3 = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
I use the data.table package. I call your data given in the second example "din".
I hope this code helps you:
#I use the library data.table; get data in data.table format
library(data.table)
din <- data.table(din)
din[,Value1:=as.numeric(Value1)]
din[,Value2:=as.numeric(Value2)]
din[,Value3:=as.numeric(Value3)]
#set Values to NA if there is a Problem
din[Problem1=="Yes", Value1:=NA]
din[Problem2=="Yes", Value2:=NA]
din[Problem3=="Yes", Value3:=NA]
#print table with NA replaced if we have a Problem
#print(din)
# Timestamp Value1 Value2 Value3 Problem1 Problem2 Problem3
#1: 00:00 32 40 45 No No No
#2: 00:05 12 55 48 No No No
#3: 00:10 NA 42 55 Yes No No
#4: 00:15 NA 33 NA Yes No Yes
#5: 00:20 78 47 75 No No No
#use the mean function to replace if I have an NA in the table (just working if Timestamp is a unique id, otherwise you need to generate one and use this in the by argument)
din[is.na(Value1), Value1 := mean(c(Value2, Value3), na.rm = TRUE), by = Timestamp]
din[is.na(Value2), Value2 := mean(c(Value1, Value3), na.rm = TRUE), by = Timestamp]
din[is.na(Value3), Value3 := mean(c(Value1, Value2), na.rm = TRUE), by = Timestamp]
#print final table
#print(din)
# Timestamp Value1 Value2 Value3 Problem1 Problem2 Problem3
#1: 00:00 32.0 40 45 No No No
#2: 00:05 12.0 55 48 No No No
#3: 00:10 48.5 42 55 Yes No No
#4: 00:15 33.0 33 33 Yes No Yes
#5: 00:20 78.0 47 75 No No No
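The question says the number of Value/Problem columns is unknown in advance, so here is a hedged data.table sketch (my own generalization, not part of the answer) that handles any number of column pairs. Here `dat` is a hypothetical fresh data.table copy of the second example, and rows where every value is problematic would come out as NaN:
library(data.table)
value_cols   <- grep("^Value",   names(dat), value = TRUE)
problem_cols <- grep("^Problem", names(dat), value = TRUE)
dat[, (value_cols) := lapply(.SD, as.numeric), .SDcols = value_cols]
# blank out each value flagged as a problem
for (i in seq_along(value_cols)) {
  dat[get(problem_cols[i]) == "Yes", (value_cols[i]) := NA_real_]
}
# fill each NA with the row mean of the remaining, non-problematic values
dat[, (value_cols) := {
  m <- rowMeans(.SD, na.rm = TRUE)
  lapply(.SD, function(v) fifelse(is.na(v), m, v))
}, .SDcols = value_cols]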
I made a sample for the data:
df <- data.frame(value1 = runif(10, min = 0, max = 100),
                 value2 = runif(10, min = 0, max = 100),
                 value3 = runif(10, min = 0, max = 100))
df_problem <- data.frame(problem1 = sample(c('yes', 'no'), 10, replace = T),
                         problem2 = sample(c('yes', 'no'), 10, replace = T),
                         problem3 = sample(c('yes', 'no'), 10, replace = T))
See that I separated the values from the problems. Then:
df_problem[df_problem == 'yes'] <- NA   # a problem drops that value
df_problem[df_problem == 'no'] <- 1     # no problem keeps it
df_problem <- matrix(as.numeric(unlist(df_problem)), nrow = nrow(df)) # rebuild matrix
Finally:
df <- df * df_problem                   # problematic entries become NA
for (i in 1:nrow(df)){
  if (any(is.na(df[i, ]))){
    df[i, which(is.na(df[i, ]))] <- mean(unlist(df[i, ]), na.rm = TRUE)
  }
}
df

How to use the function slice correctly

I was looking to use slice, and this example seemed the most applicable. However, when I tried it on my data it didn't work, and I'm not sure what I am missing.
Data
library(tidyverse)
library(lubridate)
# Data
my_df <- structure(list(Date = structure(c(17896, 17986, 18077, 18160), class = "Date"),
t_period = c("Quarter", "Quarter", "Quarter", "Quarter"),
A = c(-10.2, 9.4, 3.2, 0.7), B = c(-13.6, 13.4, 6.2, 2.8)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -4L), groups = structure(list(
Date = structure(c(17896, 17986, 18077, 18160), class = "Date"),
.rows = list(1L, 2L, 3L, 4L)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE))
# my_df
# A tibble: 4 x 4
# Groups: Date [4]
# Date t_period A B
# <date> <chr> <dbl> <dbl>
#1 2018-12-31 Quarter -10.2 -13.6
#2 2019-03-31 Quarter 9.4 13.4
#3 2019-06-30 Quarter 3.2 6.2
#4 2019-09-21 Quarter 0.7 2.8
What I want to do is filter out the last quarter date if it is before the quarter end.
# Get end of existing quarter
end_of_qtr <- ceiling_date(as.Date("2019-09-28"), "quarter") - 1
# Filter out last row if Date < quarter end date
filtered_df <- my_df %>%
  group_by(t_period) %>%
  arrange(Date) %>%
  slice(ifelse(n() < end_of_qtr, 1:(n() - 1), n()))
However, this does not return the desired result, which would be the first 3 rows of my_df, since the last date ("2019-09-21") is before the end_of_qtr date ("2019-09-30").
Could someone tell me the correct way to use slice to do this please?
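For what it is worth: ifelse() returns a result the same length as its test (length 1 here), and n() is a row count rather than a date, so n() < end_of_qtr never expresses the intended comparison. A minimal sketch of one way to do it (not from the original thread, reusing end_of_qtr from above and assuming dplyr and lubridate are loaded) compares the last Date with the quarter end inside a plain if():
filtered_df <- my_df %>%
  group_by(t_period) %>%                      # replaces the existing grouping by Date
  arrange(Date, .by_group = TRUE) %>%
  slice(if (last(Date) < end_of_qtr) seq_len(n() - 1) else seq_len(n()))
An equivalent filter() form would be filter(Date != max(Date) | max(Date) >= end_of_qtr), which drops only a final row that falls before the quarter end.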

Solution for repeating values in a given date range

Error in seq.Date(as.Date(retail$Valid_from), as.Date(retail$Valid_to), :
'from' must be of length 1
I have tried both of the methods mentioned in the question: How should I deal with 'from' must be of length 1 error?
I basically want to repeat the quantity for each day in a given date range :
HSD_RSP Valid_from Valid_to
70 1/1/2018 1/15/2018
80 1/16/2018 1/31/2018
.
.
.
Method 1:
byDay = ddply(retail, .(HSD_RSP), transform,
              day = seq(as.Date(retail$Valid_from), as.Date(retail$Valid_to), by = "day"))
Method 2:
dt <- data.table(retail)
dt <- dt[, seq(as.Date(Valid_from), as.Date(Valid_to), by = "day"),
         by = list(HSD_RSP)]
HSD_RSP final_date
70 1/1/2018
70 2/1/2018
70 3/1/2018
70 4/1/2018
.
.
.
output of
dput(head(retail))
structure(list(HSD_RSP = c(61.68, 62.96, 63.14, 60.51, 60.34,
61.63), Valid_from = structure(c(1483315200, 1484524800, 1487116800,
1491004800, 1491523200, 1492300800), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), Valid_to = structure(c(1484438400, 1487030400,
1490918400, 1491436800, 1492214400, 1493510400), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
Convert the columns to Date, create a sequence of dates between Valid_from and Valid_to for each row (seq() needs a single from and to, which is why calling it on the whole column fails with the "'from' must be of length 1" error), and unnest:
library(tidyverse)
df %>%
  mutate_at(vars(starts_with("Valid")), as.Date, "%m/%d/%Y") %>%
  mutate(Date = map2(Valid_from, Valid_to, seq, by = "1 day")) %>%
  unnest(Date) %>%
  select(-Valid_from, -Valid_to)
# HSD_RSP Date
# <int> <date>
# 1 70 2018-01-01
# 2 70 2018-01-02
# 3 70 2018-01-03
# 4 70 2018-01-04
# 5 70 2018-01-05
# 6 70 2018-01-06
# 7 70 2018-01-07
# 8 70 2018-01-08
# 9 70 2018-01-09
#10 70 2018-01-10
# … with 21 more rows
data
df <- structure(list(HSD_RSP = c(70L, 80L), Valid_from = structure(1:2,
.Label = c("1/1/2018", "1/16/2018"), class = "factor"), Valid_to =
structure(1:2, .Label = c("1/15/2018", "1/31/2018"), class = "factor")),
class = "data.frame", row.names = c(NA, -2L))
Using Ronak Shah's data structure (the df defined above), with data.table:
library(data.table)
dt <- as.data.table(df)
dt[, .(final_date = seq(as.Date(Valid_from, "%m/%d/%Y"), as.Date(Valid_to, "%m/%d/%Y"), by = "day")),
   by = HSD_RSP]
HSD_RSP final_date
1: 70 2018-01-01
2: 70 2018-01-02
3: 70 2018-01-03
4: 70 2018-01-04
....

r ifelse condition for the calculation on multiple dataframes

I have 3 data frames, df1 = a time interval, df2 = list of IDs, df3 = list of IDs with associated date.
df1 <- structure(list(season = structure(c(2L, 1L), .Label = c("summer",
"winter"), class = "factor"), mindate = structure(c(1420088400,
1433131200), class = c("POSIXct", "POSIXt")), maxdate = structure(c(1433131140,
1448945940), class = c("POSIXct", "POSIXt")), diff = structure(c(150.957638888889,
183.040972222222), units = "days", class = "difftime")), .Names = c("season",
"mindate", "maxdate", "diff"), row.names = c(NA, -2L), class = "data.frame")
df2 <- structure(list(ID = c(23796, 23796, 23796)), .Names = "ID", row.names = c(NA,
-3L), class = "data.frame")
df3 <- structure(list(ID = c("23796", "123456", "12134"), time = structure(c(1420909920,
1444504500, 1444504500), class = c("POSIXct", "POSIXt"), tzone = "US/Eastern")), .Names = c("ID",
"time"), row.names = c(NA, -3L), class = "data.frame")
The code should compare df2$ID with df3$ID. If they match, and if df3$time >= df1$mindate and df3$time <= df1$maxdate, then the result is df1$maxdate - df3$time; otherwise it is df1$maxdate - df1$mindate. I tried using the ifelse function. This works when I manually specify specific cells, but that is not what I want, as I have many more rows (of uneven number) in each of the dfs.
df1$result <- ifelse(df2[1, 1] == df3[1, 1] & df3[1, 2] >= df1$mindate & df3[1, 2] <= df1$maxdate,
                     difftime(df1$maxdate, df3[1, 2], units = "days"),
                     difftime(df1$maxdate, df1$mindate, units = "days"))
EDIT: The desired output is (when removing last row of df2):
season mindate maxdate diff result
1 winter 2015-01-01 2015-05-31 23:59:00 150.9576 days 141.9576
2 summer 2015-06-01 2015-11-30 23:59:00 183.0410 days 183.0410
Any ideas? I don't see how I could merge the dfs to make them the same length. Note that df2 can have any number of rows without affecting the code; issues arise when df1 and df3 differ in their number of rows.
The > and < are vectorized:
transform(df1, result = ifelse(df3$ID %in% df2$ID & df3$time > mindate & df3$time < maxdate,
                               difftime(maxdate, df3$time),
                               difftime(maxdate, mindate)))
season mindate maxdate diff result
1 winter 2014-12-31 21:00:00 2015-05-31 20:59:00 150.9576 days 141.9576
2 summer 2015-05-31 21:00:00 2015-11-30 20:59:00 183.0410 days 183.0410
You can also use the %between% operator from the data.table library:
library(data.table)
transform(df1, result = ifelse(df3$ID %in% df2$ID & df3$time %between% df1[2:3],
                               difftime(maxdate, df3$time),
                               difftime(maxdate, mindate)))
season mindate maxdate diff result
1 winter 2014-12-31 21:00:00 2015-05-31 20:59:00 150.9576 days 141.9576
2 summer 2015-05-31 21:00:00 2015-11-30 20:59:00 183.0410 days 183.0410
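Both answers rely on df3 lining up row-for-row with df1, which happens to hold for this sample. A hedged base R sketch (my own, not from the thread) loops row by row over df1 so the other data frames can have any number of rows; when several df3 times fall inside the same interval it uses the latest one, which is an assumption you may need to adjust:
df1$result <- sapply(seq_len(nrow(df1)), function(i) {
  hit <- df3$ID %in% df2$ID &
         df3$time >= df1$mindate[i] & df3$time <= df1$maxdate[i]
  if (any(hit)) {
    as.numeric(difftime(df1$maxdate[i], max(df3$time[hit]), units = "days"))
  } else {
    as.numeric(difftime(df1$maxdate[i], df1$mindate[i], units = "days"))
  }
})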
