date val cal_val
1/12/2017 0:15 (0_04),(1_08),(0_12),(1_14) (0_04),(1_08),(0_12),(1_14)
1/12/2017 0:30 (0_22),(0_25),(1_29) (0_22),(1_29)
1/12/2017 0:45 (1_34),(1_38),(0_40),(1_44) (1_38),(0_40),(1_44)
1/12/2017 1:00 (1_47),(1_49),(1_53),(1_57),(0_59) (1_57),(0_59)
1/12/2017 1:15 (0_07),(0_09),(0_10),(0_13),(1_14) (0_7),(1_14)
How to search every single char after special char "(" and if they are consecutive or frozen
value with "0" then then consider min of value just after "_" else if it is "1" consider from max position , if there is no consecutive values the it remains
same.
i.e in row_1 : there is no consecutive values.
row_2 : (0_22),(0_25) are consecutive then consider min i.e (0_22) and later
row_3 : (1_34),(1_38) are consecutive then consider max i.e (1_38) and later
row_4 : (0_07),(0_09),(0_10),(0_13),(1_14) are consecutive then consider min i.e (0_7) and later
Thanks in advance.
Here's a tidyverse solution:
You can use stringr functions to pull out the 0-matching and 1-matching cases separately, and then combine them after applying min/max as specified:
df %>%
rowwise() %>%
mutate(
zero = min(
as.numeric(
str_extract_all(
str_extract(val, "(\\(0_\\d+\\),){2,}"), # find 0-consecutives
"\\d{2}")[[1]])), # pull out the 2-digit values
one = max(
as.numeric(
str_extract_all(
str_extract(val, "(\\(1_\\d+\\),){2,}"), # find 1-consecutives
"\\d{2}")[[1]])),
final = sum(zero, one, na.rm=TRUE))
# A tibble: 5 x 5
date val zero one final
<chr> <chr> <dbl> <dbl> <dbl>
1 1/12/2017 0:15 (0_04),(1_08),(0_12),(1_14) NA NA 0.
2 1/12/2017 0:30 (0_22),(0_25),(1_29) 22. NA 22.
3 1/12/2017 0:45 (1_34),(1_38),(0_40),(1_44) NA 38. 38.
4 1/12/2017 1:00 (1_47),(1_49),(1_53),(1_57… NA 57. 57.
5 1/12/2017 1:15 (0_07),(0_09),(0_10),(0_13… 7. NA 7.
Another approach could be
library(tidyverse)
library(data.table)
#prepare data to count consecutive 0 or 1
df1 <- df %>%
mutate(val = gsub("[()]", "", val)) %>%
separate_rows(val, sep = ",") %>%
separate("val", c("val_pre", "val_post"))
#identify consecutive 0 or 1 - TRUE in 'flag' column indicates consecutive 0 or 1
setDT(df1)[, seq_ind := seq(.N), by = .(date_col, rleid(val_pre))
][, flag := shift(seq_ind, type="lead",) > 1 | seq_ind > 1, by = date_col]
#filter consecutive rows. In there zero's repetition is replaced with min value & 1's repetition with max value
df2 <- setDF(df1) %>%
filter(flag == T) %>%
group_by(date_col, val_pre) %>%
mutate(val_post = ifelse(val_pre == 0, min(val_post), max(val_post))) %>%
#row-bind non-consecutive rows as is
bind_rows(setDF(df1) %>% filter(flag == F | is.na(flag))) %>%
select(-seq_ind, -flag) %>%
distinct() %>%
mutate(cal_val = paste0("(", val_pre, "_", val_post, ")")) %>%
group_by(date_col) %>%
summarise(cal_val = paste(cal_val, collapse = ","))
which gives
df2
date_col cal_val
1 1/12/2017 0:15 (0_04),(1_08),(0_12),(1_14)
2 1/12/2017 0:30 (0_22),(1_29)
3 1/12/2017 0:45 (1_38),(0_40),(1_44)
4 1/12/2017 1:00 (1_57),(0_59)
5 1/12/2017 1:15 (0_07),(1_14)
Sample data:
df <- structure(list(date_col = c("1/12/2017 0:15", "1/12/2017 0:30",
"1/12/2017 0:45", "1/12/2017 1:00", "1/12/2017 1:15"), val = c("(0_04),(1_08),(0_12),(1_14)",
"(0_22),(0_25),(1_29)", "(1_34),(1_38),(0_40),(1_44)", "(1_47),(1_49),(1_53),(1_57),(0_59)",
"(0_07),(0_09),(0_10),(0_13),(1_14)")), .Names = c("date_col",
"val"), class = "data.frame", row.names = c(NA, -5L))
Related
I have a df here (the desired output, my starting df does not have the Flag variable):
df <- data.frame(
Person = c('1','2','3'),
Date = as.Date(c('2010-09-30', '2012-11-20', '2015-03-11')),
Treatment_1 = as.Date(c('2010-09-30', '2012-11-21', '2015-03-22')),
Treatment_2 = as.Date(c('2011-09-30', 'NA', '2011-03-22')),
Treatment_3 = as.Date(c('2012-09-30', '2015-11-21', '2015-06-22')),
Surgery_1 = as.Date(c(NA, '2016-11-21', '2015-03-12')),
Surgery_2 = as.Date(c(NA, '2017-11-21', '2019-03-12')),
Surgery_3 = as.Date(c(NA, '2018-11-21', '2013-03-12')),
Flag = c('', 'Y', '')
)
and I want to derive the Flag variable based on these conditions:
For any column that starts with Treatment, set Flag to "" if Date = Treatment
For any column that starts with Surgery, set Flag to "" if Date = Surgery OR Date = Surgery +1 OR Date = Surgery - 1 (basically if the Surgery date is on the day, one day before, or one day after the Date variable, set Flag to "").
else set Flag = "Y"
I've looked into mutate_at but that rewrites the variables and assigns values of True/False.
This is wrong but this is my attempt:
df2 <- df %>%
mutate(Flag = case_when(
vars(starts_with("Treatment"), Date == . ) ~ '',
vars(starts_with("Surgery"), Date == . | Date == . - 1 | Date == . + 1) ~ '',
TRUE ~ 'Y')
)
UPDATE 2022-Aug-22
When I change a cell with the same date as the one in row 2:
df <- data.frame(
Person = c('1','2','3'),
Date = as.Date(c('2010-09-30', '2012-11-20', '2015-03-11')),
Treatment_1 = as.Date(c('2010-09-30', '2012-11-21', '2015-03-22')),
Treatment_2 = as.Date(c('2011-09-30', 'NA', '2011-03-22')),
Treatment_3 = as.Date(c('2012-09-30', '2015-11-21', '2015-06-22')),
Surgery_1 = as.Date(c(NA, '2016-11-21', '2015-03-12')),
Surgery_2 = as.Date(c(NA, '2017-11-21', '2019-03-12')),
Surgery_3 = as.Date(c(NA, '2018-11-21', '2012-11-20')),
Flag = c('', 'Y', '')
)
and then re-run the base R solution, the Flag in the second row is no longer "Y" but it should be as in that row, it doesn't meet any of the above conditions.
We can use rowwise and c_across along with any for each condition in case_when. Then, we can make a list for the Date (and +1, -1 days) for Surgery to match.
library(tidyverse)
df %>%
rowwise() %>%
mutate(Flag = case_when(
any(c_across(starts_with("Treatment")) == Date) ~ "",
any(c_across(starts_with("Surgery")) %in% c(Date, (Date +1), (Date-1))) ~ "",
TRUE ~ "Y"
))
Output
Person Date Treatment_1 Treatment_2 Treatment_3 Surgery_1 Surgery_2 Surgery_3 Flag
<chr> <date> <date> <date> <date> <date> <date> <date> <chr>
1 1 2010-09-30 2010-09-30 2011-09-30 2012-09-30 NA NA NA ""
2 2 2012-11-20 2012-11-21 NA 2015-11-21 2016-11-21 2017-11-21 2018-11-21 "Y"
3 3 2015-03-11 2015-03-22 2011-03-22 2015-06-22 2015-03-12 2019-03-12 2013-03-12 ""
Update
Here is a possible base R solution that is a lot quicker than tidyverse. This could be done in one line of code, but I decided that readability is better. First, I duplicate the Surgery columns so that we have +1 day and -1 day, and then convert these columns to character. Then, I subset the Treatment columns and convert to character. I convert to character as you cannot compare Date with %in% or ==. Then, I bind the date, treatment, and surgery columns together (a). Then, I use an ifelse for if the Date is in any of the columns but doing it row by row with apply, then we return "" and if not then return Y. Then, I bind the result back to the original dataframe (minus Flag from your original dataframe).
dup_names <- colnames(df)[startsWith(colnames(df), "Surgery")]
surgery <-
cbind(df[dup_names], setNames(df[dup_names] + 1, paste0(dup_names, "_range1")))
surgery <-
sapply(cbind(surgery, setNames(df[dup_names] - 1, paste0(
dup_names, "_range2"
))), as.character)
treatment <-
sapply(df[startsWith(colnames(df), "Treatment")], as.character)
a <- cbind(Date = as.character(df$Date), treatment, surgery)
cbind(subset(df, select = -Flag),
Flag = ifelse(apply(a[,1]==a[,2:ncol(a)], 1, any, na.rm = TRUE), "", "Y"))
Benchmark
Here is an alternative using across approach:
library(tidyverse)
df %>%
mutate(across(starts_with("Treatment"), ~as.numeric(. %in% Date), .names ="new_{.col}"),
across(starts_with("Surgery"), ~as.numeric(. %in% c(Date, Date+1, Date-1)), .names ="new_{.col}")) %>%
mutate(Flag = ifelse(rowSums(select(., contains('new')))==1, "", "Y"), .keep="used") %>%
bind_cols(df)
Flag Person Date Treatment_1 Treatment_2 Treatment_3 Surgery_1 Surgery_2 Surgery_3
1 1 2010-09-30 2010-09-30 2011-09-30 2012-09-30 <NA> <NA> <NA>
2 Y 2 2012-11-20 2012-11-21 <NA> 2015-11-21 2016-11-21 2017-11-21 2018-11-21
3 3 2015-03-11 2015-03-22 2011-03-22 2015-06-22 2015-03-12 2019-03-12 2013-03-12
Updated to add data.table approach
If you want a data.table approach, here it is:
df[melt(df, id=c(1,2))[,flag:=fifelse(
(str_starts(variable,"T") & value==Date) |
(str_starts(variable,"S") & abs(value-Date)<=1),"", "Y")][
, .(flag=min(flag,na.rm=T)), Person], on=.(Person)]
Output
Person Date Treatment_1 Treatment_2 Treatment_3 Surgery_1 Surgery_2 Surgery_3 flag
1: 1 2010-09-30 2010-09-30 2011-09-30 2012-09-30 <NA> <NA> <NA>
2: 2 2012-11-20 2012-11-21 <NA> 2015-11-21 2016-11-21 2017-11-21 2018-11-21 Y
3: 3 2015-03-11 2015-03-22 2011-03-22 2015-06-22 2015-03-12 2019-03-12 2013-03-12
I like Andrew's approach, but I was working on this when his answer came in, so here it is in case you are interested
df %>% inner_join(
pivot_longer(df, cols=Treatment_1:Surgery_3) %>%
mutate(flag=case_when(
(str_starts(name,"T") & value==Date) | (str_starts(name,"S") & abs(value-Date)<=1) ~ "",
TRUE ~"Y")) %>%
group_by(Person) %>%
summarize(flag = min(flag))
)
Output:
Person Date Treatment_1 Treatment_2 Treatment_3 Surgery_1 Surgery_2 Surgery_3 flag
1 1 2010-09-30 2010-09-30 2011-09-30 2012-09-30 <NA> <NA> <NA>
2 2 2012-11-20 2012-11-21 <NA> 2015-11-21 2016-11-21 2017-11-21 2018-11-21 Y
3 3 2015-03-11 2015-03-22 2011-03-22 2015-06-22 2015-03-12 2019-03-12 2013-03-12
I try to calculate the date difference between second row and last row per group id. The data looks like
data<- data.frame(pid= c(1, 1, 1,1, 2, 2, 2, 3, 3, 3,3 ,3), day = c("25/07/2018", "19/10/2018", "17/01/2019", "19/03/2019", "10/09/2018","29/11/2018", "26/03/2019", "17/06/2016", "25/04/2018", "17/07/2018","05/04/2019", "09/02/2021"), catt=c(1,1,2,1,1,1,2,2,2,1,1,2))
data
pid
day
1
1
25/07/2018
2
1
19/10/2018
3
1
17/01/2019
4
1
19/03/2019
5
2
10/09/2018
6
2
29/11/2018
7
2
26/03/2019
8
3
17/06/2016
9
3
25/04/2018
10
3
17/07/2018
11
3
05/04/2019
12
3
09/02/2021
I use the following code to obtain a difference in months.
difftime("19/10/2018","19/03/2019 ", units = "days")/ (30)
difftime("29/11/2018","26/03/2019 ", units = "days")/ (30)
difftime("25/04/2018","09/02/2021 ", units = "days")/ (30)
The desired output
id day difference
1 25/07/2018
1 19/10/2018
1 17/01/2019
1 19/03/2019 7.13
2 10/09/2018
2 29/11/2018
2 26/03/2019 44.7
3 17/06/2016
3 25/04/2018
3 17/07/2018
3 05/04/2019
3 09/02/2021 196.7667
But it is difficult to large data, so anyone can help using lubricate () + slice() functions
Convert to date object and calculate the difference between last and second date for each pid.
library(dplyr)
library(lubridate)
data %>%
mutate(day = dmy(day)) %>%
arrange(pid, day) %>%
group_by(pid) %>%
summarise(difference = (last(day) - day[2])/30)
# pid difference
# <dbl> <dbl>
#1 1 5.03
#2 2 3.9
#3 3 34.0
If you want to maintain the number of rows in the dataframe, use mutate and replace the difference only on the last row of the dataframe.
data %>%
mutate(day = dmy(day)) %>%
arrange(pid, day) %>%
group_by(pid) %>%
mutate(difference = ifelse(row_number() == n(), (last(day) - day[2])/30, NA))
Note that output from difftime in the question is incorrect.
#Wrong output
difftime("19/10/2018","19/03/2019 ", units = "days")
#Time difference of 214 days
#Correct output
difftime(dmy("19/03/2019"), dmy("19/10/2018"), units = "days")
#Time difference of 151 days
How can I get the difference between Date1 and Date2 columns of my dataframe?
Date1 Tfd Date2 Sex
13/08/1936 3 09/01/2013 M
25/04/1948 2 14/05/2014 M
26/01/1939 1 03/07/2015 F
13/02/1935 8 03/08/2012 F
I have tryed:
age<-apply(df[, c("Date1", "Date2")], function(x, y) difftime(strptime(y, format = "%d.%m.%Y"), strptime(x, format = "%d.%m.%Y"),units="years"))
but I get this error:
Error in strptime(y, format = "%d.%m.%Y") :
argument "y" is missing, with no default
Do you know how can I solve this?
You don't need apply here :
as.numeric(as.Date(df$Date2, "%d/%m/%Y") - as.Date(df$Date1, "%d/%m/%Y"))
#[1] 27908 24125 27917 28296
difftime does not have units as 'years'. The maximum units it has is of weeks. You can divide the week value with 52.25 to get year of use lubridate's time_length function.
Or using dplyr with difftime
library(dplyr)
library(lubridate)
df %>%
mutate_at(vars(starts_with('date')), lubridate::dmy) %>%
mutate(diff = time_length(difftime(Date2, Date1), 'years'))
# Date1 Tfd Date2 Sex diff
#1 1936-08-13 3 2013-01-09 M 76.4
#2 1948-04-25 2 2014-05-14 M 66.1
#3 1939-01-26 1 2015-07-03 F 76.4
#4 1935-02-13 8 2012-08-03 F 77.5
I have a data.frame that doesn't account for leap year (ie all years are 365 days). I would like to repeat the last day value in February during the leap year. The DF in my code below has fake data set, I intentionally remove the leap day value in DF_NoLeapday. I would like to add a leap day value in DF_NoLeapday by repeating the value of the last day of February in a leap year (in our example it would Feb 28, 2004 value). I would rather like to have a general solution to apply this to many years data.
set.seed(55)
DF <- data.frame(date = seq(as.Date("2003-01-01"), to= as.Date("2005-12-31"), by="day"),
A = runif(1096, 0,10),
Z = runif(1096,5,15))
DF_NoLeapday <- DF[!(format(DF$date,"%m") == "02" & format(DF$date, "%d") == "29"), ,drop = FALSE]
We can use complete on the 'date' column which is already a Date class to expand the rows to fill in the missing dates
library(dplyr)
library(tidyr)
out <- DF_NoLeapday %>%
complete(date = seq(min(date), max(date), by = '1 day'))
dim(out)
#[1] 1096 3
out %>%
filter(date >= '2004-02-28', date <= '2004-03-01')
# A tibble: 3 x 3
# date A Z
# <date> <dbl> <dbl>
#1 2004-02-28 9.06 9.70
#2 2004-02-29 NA NA
#3 2004-03-01 5.30 7.35
By default, the other columns values are filled with NA, if we need to change it to a different value, it can be done within complete with fill
If we need the previous values, then use fill
out <- out %>%
fill(A, Z)
out %>%
filter(date >= '2004-02-28', date <= '2004-03-01')
# A tibble: 3 x 3
# date A Z
# <date> <dbl> <dbl>
#1 2004-02-28 9.06 9.70
#2 2004-02-29 9.06 9.70
#3 2004-03-01 5.30 7.35
I am not good at "R" and not sure how to rearrange and subset time series data. Sorry, if this question sounds stupid.
I have a time series data of sea tide with four values per day (with missing values as well). Two values for high tide and two values for low tide. The time and date are given in the same column but in different rows. Now, I want to subset the data only for daytime (from 7:00 AM to 7:00 PM) not for night. Then I want to have data arranged in three columns only i) Date, ii) Time and iii) Tide. For Tide, I only need minimum and maximum values. Here is an example of the data and the desired arrangement of data. For each date, data is arranged in three rows similar to the example.
1/1/2011 Low High Low NA
Time 2:58 AM 9:38 AM 5:19 PM NA
Tide 1.2 m 2.2 m 0.6 m NA
1/2/2011 High Low High Low
Time 2:07 AM 4:22 AM 10:19 AM 6:07 PM
Tide 1.4 m 1.3 m 2.3 m 0.4 m
Date Time Tide
1/1/2011 17:19 0.6
1/1/2011 9:38 2.2
1/2/2011 2:07 1.4
1/2/2011 18:07 0.4
The input, DF is assumed to be as in the Note below.
g, the grouping vector, has one element per row of DF and is equal to c(1, 1, 1, 2, 2, 2, ...). Alternate ways to compute g would be n <- nrow(DF); g <- gl(n, 3, n) or n <- nrow(DF); g <- rep(1:3, n, n).
We then use by to split DF into groups and apply the indicated anonymous function to each group as defined by g.
The anonymous function combines the date and the times in the current group to create the date/times dt making use of the fact that the common date is x[1,1] and the times prior to being cleaned up are in x[2,-1].
Using dt and the tides in x[2, -1] (prior to being cleaned up) it computes each of the three columns arranging them into a data frame. Then there is a commented out line which removes NA values. If you want this uncomment it. Subset the data frame obtained so far to the 7am to 7pm time period and further take the two rows consisting of the min and max tide. We sort that by time.
Finally do.call("rbind", ...) puts the groups together into one overall data frame.
No packages are used.
g <- cumsum(grepl("\\d", DF$V1))
Long <- do.call("rbind", by(DF, g, function(x) {
dt <- as.POSIXct(paste(x[1,1], as.matrix(x[2, -1])), format = "%m/%d/%Y %I:%M %p")
X <- data.frame(Date = as.Date(dt),
Time = format(dt, "%H:%M"),
Tide = as.numeric(sub("m", "", as.matrix(x[3, -1]))),
stringsAsFactors = FALSE)
# X <- na.omit(X)
X <- subset(X, Time >= "07:00" & Time <= "19:00")
X <- X[c(which.min(X$Tide), which.max(X$Tide)), ]
X[order(X$Time), ]
}))
giving the following -- note that the third row in the question's output is not between 7am and 7pm so the output here necessarily differs.
> Long
Date Time Tide
1.2 2011-01-01 09:38 2.2
1.3 2011-01-01 17:19 0.6
2.3 2011-01-02 10:19 2.3
2.4 2011-01-02 18:07 0.4
Note: The input DF is assumed to be as follows in reproducible form:
Lines <- "1/1/2011,Low,High,Low,NA
Time,2:58 AM,9:38 AM,5:19 PM,NA
Tide,1.2 m,2.2 m,0.6 m,NA
1/2/2011,High,Low,High,Low
Time,2:07 AM,4:22 AM,10:19 AM,6:07 PM
Tide,1.4 m,1.3 m,2.3 m,0.4 m"
DF <- read.table(text = Lines, sep = ",", as.is = TRUE)
If the list is not too long, this endeavour would be simpler to do in a spreadsheet simply by mapping cells and filtering. But one way to do it in R with zoo and tidyverse is the following:
Assuming that the original dataframes have their columns named as C1:C5
C1 C2 C3 C4 C5
<chr> <chr> <chr> <chr> <chr>
1 1/1/2010 Low High Low <NA>
2 Time 2:58 AM 9:38 AM 5:19 PM <NA>
3 Tide 1.2 2.2 0.6 <NA>
4 1/2/2011 High Low High Low
5 Time 2:07 AM 4:22 AM 10:19 AM 6:07 PM
6 Tide 1.4 1.3 2.3 0.4
DF <- DF %>%
mutate(Date = as.Date(gsub("Tide|Time","", C1), format = "%d/%m/%Y"))
DF <- DF %>%
mutate(Date = na.locf(DF$Date, na.rm = TRUE),
C1 = gsub("[[:digit:]]|\\/", "", C1),
Type = if_else(nchar(C1) == 0, "TideType", C1)) %>%
select(Date, Type, C2:C5) %>%
gather(oColumn, Value, -c(Date, Type)) %>%
spread(key = Type, value = Value) %>%
select(Date, Time, Tide) %>%
filter(complete.cases(.))
DF <- DF %>%
mutate(Time = ymd_hm(paste(DF$Date, DF$Time, sep = " ")),
Tide = as.numeric(Tide))
DF <- DF %>%
mutate(DayNight = (DF$Time) %within%
interval(as.POSIXlt(DF$Date) + (7*60*60), as.POSIXlt(DF$Date) + (19*60*60))) %>%
filter(DayNight == TRUE) %>%
select(-DayNight) ) %>%
group_by(Date) %>%
filter(Tide == max(Tide) | min(Tide))
DF
Source: local data frame [4 x 3]
Groups: Date [2]
Date Time Tide
<date> <dttm> <dbl>
1 2010-01-01 2010-01-01 09:38:00 2.2
2 2010-01-01 2010-01-01 17:19:00 0.6
3 2011-02-01 2011-02-01 10:19:00 2.3
4 2011-02-01 2011-02-01 18:07:00 0.4
Note that "Date" is a Date type of Object and "Time" is a Posixct type of Date-Time Object. You might want to convert "Time" into a vector of minutes.