Related
I'm QCing data and for several tanks/data_types there are faulty data that need to be removed, spanning multiple time ranges. The data_types, tanks and time ranges that contain faulty data have been reported in a separate data frame, a snippet of which is contained in this QC_table:
structure(list(trial = c(1L, 1L, 1L, 1L, 2L, 2L), data_type = c("Temp",
"pH", "pH", "pH", "Temp", "Temp"), tank = c("29", "40", "40",
"40", "13", "29"), date_time_start = c("2021-03-31 8:30", "2021-03-31 7:50",
"2021-03-31 10:25", "2021-03-31 17:05", "2021-04-07 10:25", "2021-04-08 10:30"
), date_time_end = c("2021-03-31 18:00", "2021-03-31 8:15", "2021-03-31 10:40",
"2021-03-31 17:30", "2021-04-07 17:20", "2021-04-10 18:25"),
to.be.removed = c("yes ", "yes ", "yes ", "yes ", "yes ",
"yes "), reason = c("calibration error", "faulty probe",
"faulty probe", "faulty probe", "calibration error", "faulty probe"
), data_type_tank = c("WalchemTempTank29", "pH_Tank40", "pH_Tank40",
"pH_Tank40", "WalchemTempTank13", "WalchemTempTank29")), row.names = c(NA,
-6L), class = "data.frame")
There are additional trials and tanks to this. My approach to this was to create a new dataframe with all the data that needs to be removed (based on data_type_tank and date_time_start/end columns in the QC table), and then remove that data frame from the original dataframe. I don't know if this is the most logical, but I wouldn't know how I would be able to remove the data from the original dataframe.
I construct a new dataframe using:
new_dataframe <- dataframe %>%
select(c(Measurement.time, Trial, contains(urchin_temp_pH_QC$data_type_tank))) %>% head(10)
structure(list(Measurement.time = c("2021-03-30 11:00", "2021-03-30 11:05",
"2021-03-30 11:10", "2021-03-30 11:15", "2021-03-30 11:20", "2021-03-30 11:25",
"2021-03-30 11:30", "2021-03-30 11:35", "2021-03-30 11:40", "2021-03-30 11:45"
), Trial = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), WalchemTempTank29_Avg = c("18.8",
"18.67", "18.58", "18.48", "18.38", "18.29", "18.2", "18.12",
"18.03", "18"), WalchemTempTank29_Std = c("0.037", "0.025", "0.032",
"0.029", "0.017", "0.018", "0.026", "0.025", "0.024", "0.023"
), pH_Tank40_Avg = c("7.859", "7.863", "7.868", "7.866", "7.863",
"7.864", "7.865", "7.867", "7.869", "7.87"), pH_Tank40_Std = c("0.007",
"0.006", "0.002", "0.001", "0.002", "0.001", "0.002", "0.001",
"0.002", "0.004"), WalchemTempTank13_Avg = c("10.26", "10.22",
"10.21", "10.24", "10.27", "10.3", "10.32", "10.34", "10.37",
"10.4"), WalchemTempTank13_Std = c("0.01", "0.013", "0.005",
"0.01", "0.007", "0.006", "0.006", "0.008", "0.008", "0.005")), row.names = 4:13, class = "data.frame")
However now, based on the QC table there are some rows (date/time) that I need to remove or subset for, but only for specific columns (ie. those columns that contain the data_type_tank). I think I can do this manually, using the code below, and then binding or joining columns/rows where needed, but this seems like an arduous process.
subset_row_1_QC_table <- dataframe %>% select(Measurement.time, contains("WalchemTempTank29")) %>%
subset(Measurement.time >= as.POSIXct("2021-03-31 08:30") & Measurement.time <= as.POSIXct("2021-03-31 18:00"))
Is there any way to automate this process, removing or subsetting column-specific rows, based on columns from a different data frame? I think ideally my dataframe would look something like an expanded version of eg:
Measurement.time
Trial
WaterTempTank29_Avg
WaterTempTank29_Std
pH_Tank40_Avg
pH_Tank40_Std
2021-03-31 08:30
1
18.8
0.037
NA
NA
[all 5-min intervals]
NA
NA
2021-03-31 18:00
1
18.36
0.023
NA
NA
2021-03-31 07:50
1
NA
NA
7.854
0.001
[all 5-min intervals]
1
NA
NA
7.88
0.001
2021-03-31 08:15
1
NA
NA
7.84
0.001
2021-03-31 10:25
1
NA
NA
7.881
0.001
[all 5-min intervals]
1
NA
NA
7.804
0.001
2021-03-31 10:40
1
NA
NA
7.881
0
Any help would be greatly appreciated! I hope I've been able to explain properly my problem, first-time user of StackOverflow.
Cheers,
edit: Thanks r2evans & GuedesBF -- hope this is now better/fixed.
I would do it like this.
Prepare data for variables CF and df
CF = structure(list(trial = c(1L, 1L, 1L), data_type = c("Temp", "pH",
"pH"), tank = c("29", "40", "40"), date_time_start = structure(c(1617204600,
1617202200, 1617211500), tzone = "", class = c("POSIXct", "POSIXt"
)), date_time_end = structure(c(1617238800, 1617203700, 1617212400
), tzone = "", class = c("POSIXct", "POSIXt")), to.be.removed = c("yes ",
"yes ", "yes "), reason = c("calibration error", "faulty probe",
"faulty probe"), data_type_tank = c("WalchemTempTank29", "pH_Tank40",
"pH_Tank40")), row.names = c(NA, -3L), groups = structure(list(
data_type = c("pH", "Temp"), tank = c("40", "29"), .rows = structure(list(
2:3, 1L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = 1:2, class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
df = structure(list(Measurement.time = c("2021-03-30 11:00", "2021-03-30 11:05",
"2021-03-30 11:10", "2021-03-30 11:15", "2021-03-31 18:30", "2021-03-30 11:25",
"2021-03-30 11:30", "2021-03-30 11:35", "2021-03-31 17:00", "2021-03-31 19:28"
), Trial = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), WalchemTempTank29_Avg = c("18.8",
"18.67", "18.58", "18.48", "18.38", "18.29", "18.2", "18.12",
"18.03", "18"), WalchemTempTank29_Std = c("0.037", "0.025", "0.032",
"0.029", "0.017", "0.018", "0.026", "0.025", "0.024", "0.023"
), pH_Tank40_Avg = c("7.859", "7.863", "7.868", "7.866", "7.863",
"7.864", "7.865", "7.867", "7.869", "7.87"), pH_Tank40_Std = c("0.007",
"0.006", "0.002", "0.001", "0.002", "0.001", "0.002", "0.001",
"0.002", "0.004"), WalchemTempTank13_Avg = c("10.26", "10.22",
"10.21", "10.24", "10.27", "10.3", "10.32", "10.34", "10.37",
"10.4"), WalchemTempTank13_Std = c("0.01", "0.013", "0.005",
"0.01", "0.007", "0.006", "0.006", "0.008", "0.008", "0.005")), row.names = 4:13, class = "data.frame")
Note, I changed some data in the variable Measurement.time to include the events in the CF table.
Prepare separate tables CFTemp and CFpH
CFTemp = CF %>% ungroup() %>%
filter(data_type == "Temp") %>%
mutate(Temp = "Temp",
Temp_start = date_time_start,
Temp_end = date_time_end) %>%
select(Temp, Temp_start, Temp_end)
CFpH = CF %>% ungroup() %>%
filter(data_type == "pH") %>%
mutate(pH = "pH",
pH_start = date_time_start,
pH_end = date_time_end) %>%
select(pH, pH_start, pH_end)
Prepare two functions returning vectors with binary values and for which indices data should be removed.
fTemp = function(df) CFTemp %>% left_join(df, by="Temp") %>%
mutate(TempRm = Measurement.time>=Temp_start & Measurement.time<=Temp_end) %>%
group_by(ID) %>%
summarise(TempRm = any(TempRm)) %>%
pull(TempRm)
fpH = function(df) CFpH %>% left_join(df, by="pH") %>%
mutate(pHRm = Measurement.time>=pH_start & Measurement.time<=pH_end) %>%
group_by(ID) %>%
summarise(pHRm = any(pHRm)) %>%
pull(pHRm)
Convert the data frame
df1 = df %>% as_tibble() %>%
mutate(Measurement.time = as.POSIXct(Measurement.time),
ID = 1:nrow(.),
Temp = "Temp",
pH = "pH") %>%
mutate(
TmpRm = fTemp(.),
pHRm = fpH(.)
) %>%
mutate(
WalchemTempTank29_Avg = ifelse(TmpRm, NA, WalchemTempTank29_Avg),
WalchemTempTank29_Std = ifelse(TmpRm, NA, WalchemTempTank29_Std),
WalchemTempTank13_Avg = ifelse(TmpRm, NA, WalchemTempTank13_Avg),
WalchemTempTank13_Std = ifelse(TmpRm, NA, WalchemTempTank13_Std),
pH_Tank40_Avg = ifelse(pHRm, NA, pH_Tank40_Avg),
pH_Tank40_Std = ifelse(pHRm, NA, pH_Tank40_Std),
) %>%
select(Measurement.time:WalchemTempTank13_Std)
df1
output
# A tibble: 10 x 8
Measurement.time Trial WalchemTempTank29_Avg WalchemTempTank29_Std pH_Tank40_Avg pH_Tank40_Std WalchemTempTank13_Avg WalchemTempTank13_Std
<dttm> <int> <chr> <chr> <chr> <chr> <chr> <chr>
1 2021-03-30 11:00:00 1 18.8 0.037 7.859 0.007 10.26 0.01
2 2021-03-30 11:05:00 1 18.67 0.025 7.863 0.006 10.22 0.013
3 2021-03-30 11:10:00 1 18.58 0.032 7.868 0.002 10.21 0.005
4 2021-03-30 11:15:00 1 18.48 0.029 7.866 0.001 10.24 0.01
5 2021-03-31 18:30:00 1 NA NA 7.863 0.002 NA NA
6 2021-03-30 11:25:00 1 18.29 0.018 7.864 0.001 10.3 0.006
7 2021-03-30 11:30:00 1 18.2 0.026 7.865 0.002 10.32 0.006
8 2021-03-30 11:35:00 1 18.12 0.025 7.867 0.001 10.34 0.008
9 2021-03-31 17:00:00 1 18.03 0.024 NA NA 10.37 0.008
10 2021-03-31 19:28:00 1 NA NA NA NA NA NA
And that's all.
Update 1
library(tidyverse)
CFTemp = CF %>% ungroup() %>%
filter(data_type == "Temp") %>%
mutate(Temp = "Temp",
Temp_start = date_time_start,
Temp_end = date_time_end) %>%
select(Temp, tank, Temp_start, Temp_end)
CFpH = CF %>% ungroup() %>%
filter(data_type == "pH") %>%
mutate(pH = "pH",
pH_start = date_time_start,
pH_end = date_time_end) %>%
select(pH, pH_start, pH_end)
fTemp = function(df, Tank){
out = CFTemp %>% filter(tank==Tank) %>%
left_join(df, by="Temp") %>%
mutate(TempRm = Measurement.time>=Temp_start & Measurement.time<=Temp_end) %>%
group_by(ID) %>%
summarise(TempRm = any(TempRm)) %>%
pull(TempRm)
if(length(out)==0) FALSE else out
}
fpH = function(df) CFpH %>% left_join(df, by="pH") %>%
mutate(pHRm = Measurement.time>=pH_start & Measurement.time<=pH_end) %>%
group_by(ID) %>%
summarise(pHRm = any(pHRm)) %>%
pull(pHRm)
df1 = df %>% as_tibble() %>% #Step 1
mutate(Measurement.time = as.POSIXct(Measurement.time),
ID = 1:nrow(.),
Temp = "Temp",
pH = "pH") %>%
mutate( #Step 2
TmpRm29 = fTemp(., 29),
TmpRm13 = fTemp(., 13),
pHRm = fpH(.)
) %>%
mutate( #Step 3
WalchemTempTank29_Avg = ifelse(TmpRm29, NA, WalchemTempTank29_Avg),
WalchemTempTank29_Std = ifelse(TmpRm29, NA, WalchemTempTank29_Std),
WalchemTempTank13_Avg = ifelse(TmpRm13, NA, WalchemTempTank13_Avg),
WalchemTempTank13_Std = ifelse(TmpRm13, NA, WalchemTempTank13_Std),
pH_Tank40_Avg = ifelse(pHRm, NA, pH_Tank40_Avg),
pH_Tank40_Std = ifelse(pHRm, NA, pH_Tank40_Std),
) %>%
select(Measurement.time:WalchemTempTank13_Std)
df1
output
# A tibble: 10 x 8
Measurement.time Trial WalchemTempTank29_Avg WalchemTempTank29_Std pH_Tank40_Avg pH_Tank40_Std WalchemTempTank13_Avg WalchemTempTank13_Std
<dttm> <int> <chr> <chr> <chr> <chr> <chr> <chr>
1 2021-03-30 11:00:00 1 18.8 0.037 7.859 0.007 10.26 0.01
2 2021-03-30 11:05:00 1 18.67 0.025 7.863 0.006 10.22 0.013
3 2021-03-30 11:10:00 1 18.58 0.032 7.868 0.002 10.21 0.005
4 2021-03-30 11:15:00 1 18.48 0.029 7.866 0.001 10.24 0.01
5 2021-03-31 18:30:00 1 NA NA 7.863 0.002 10.27 0.007
6 2021-03-30 11:25:00 1 18.29 0.018 7.864 0.001 10.3 0.006
7 2021-03-30 11:30:00 1 18.2 0.026 7.865 0.002 10.32 0.006
8 2021-03-30 11:35:00 1 18.12 0.025 7.867 0.001 10.34 0.008
9 2021-03-31 17:00:00 1 18.03 0.024 NA NA 10.37 0.008
10 2021-03-31 19:28:00 1 NA NA NA NA 10.4 0.005
My data is grouped in month-year format under names of corresponding months. But I want to group them under their corresponding years.
I have a list like below: values are grouped under same months.
$Apr
$Apr$`04-2036`
date value
116 04-25-2036 1.14
117 04-26-2036 0.67
$Apr$`04-2037`
date value
478 04-22-2037 0
479 04-23-2037 0
$Mar
$Mar$`03-2037`
date value
446 03-21-2037 1.67
447 03-22-2037 0.00
$May
$May$`05-2036`
date value
146 05-25-2036 0.00
147 05-26-2036 2.31
And here is its structure:
sample<-structure(list(Apr = structure(list(`04-2036` = structure(list(
date = c("04-25-2036", "04-26-2036"), value = c(1.14, 0.67
)), .Names = c("date", "value"), row.names = 116:117, class = "data.frame"),
`04-2037` = structure(list(date = c("04-22-2037", "04-23-2037"
), value = c(0, 0)), .Names = c("date", "value"), row.names = 478:479, class = "data.frame")), .Names = c("04-2036",
"04-2037")), Mar = structure(list(`03-2037` = structure(list(
date = c("03-21-2037", "03-22-2037"), value = c(1.67, 0)), .Names = c("date",
"value"), row.names = 446:447, class = "data.frame")), .Names = "03-2037"),
May = structure(list(`05-2036` = structure(list(date = c("05-25-2036",
"05-26-2036"), value = c(0, 2.31)), .Names = c("date", "value"
), row.names = 146:147, class = "data.frame")), .Names = "05-2036")), .Names = c("Apr",
"Mar", "May"))
Desired output: data will be grouped under same years.
$`2036`
$`2036`$`04-2036`
date value
116 04-25-2036 1.14
117 04-26-2036 0.67
$`2036`$`05-2036`
date value
146 05-25-2036 0.00
147 05-26-2036 2.31
$`2037`
$`2037`$`03-2037`
date value
446 03-21-2037 1.67
447 03-22-2037 0.00
$`2037`$`04-2037`
date value
478 04-22-2037 0
479 04-23-2037 0
structure of output will be like this one:
output<-structure(list(`2036` = structure(list(`04-2036` = structure(list(
date = c("04-25-2036", "04-26-2036"), value = c(1.14, 0.67
)), .Names = c("date", "value"), row.names = 116:117, class = "data.frame"),
`05-2036` = structure(list(date = c("05-25-2036", "05-26-2036"
), value = c(0, 2.31)), .Names = c("date", "value"), row.names = 146:147, class = "data.frame")), .Names = c("04-2036",
"05-2036")), `2037` = structure(list(`03-2037` = structure(list(
date = c("03-21-2037", "03-22-2037"), value = c(1.67, 0)), .Names = c("date",
"value"), row.names = 446:447, class = "data.frame"), `04-2037` = structure(list(
date = c("04-22-2037", "04-23-2037"), value = c(0, 0)), .Names = c("date",
"value"), row.names = 478:479, class = "data.frame")), .Names = c("03-2037",
"04-2037"))), .Names = c("2036", "2037"))
You can combine the data into one dataframe, extract the year from it and split.
library(dplyr)
library(purrr)
map_df(sample, bind_rows, .id = 'month') %>%
mutate(date = mdy(date),
year = year(date)) %>%
split(.$year) %>%
map(~split(.x, .x$month))
#$`2036`
#$`2036`$Apr
# month date value year
#1 Apr 2036-04-25 1.14 2036
#2 Apr 2036-04-26 0.67 2036
#$`2036`$May
# month date value year
#7 May 2036-05-25 0.00 2036
#8 May 2036-05-26 2.31 2036
#$`2037`
#$`2037`$Apr
# month date value year
#3 Apr 2037-04-22 0 2037
#4 Apr 2037-04-23 0 2037
#$`2037`$Mar
# month date value year
#5 Mar 2037-03-21 1.67 2037
#6 Mar 2037-03-22 0.00 2037
We can use base R
df1 <- transform(do.call(rbind, Map(cbind, Month = names(sample),
lapply(sample, function(x) do.call(rbind, x)))), date = as.Date(date, "%m-%d-%Y"))
df1$year <- format(df1$date, "%Y")
row.names(df1) <- NULL
lapply(split(df1, df1$year), function(x) split(x, x$Month))
$`2036`
$`2036`$Apr
Month date value year
1 Apr 2036-04-25 1.14 2036
2 Apr 2036-04-26 0.67 2036
$`2036`$May
Month date value year
7 May 2036-05-25 0.00 2036
8 May 2036-05-26 2.31 2036
$`2037`
$`2037`$Apr
Month date value year
3 Apr 2037-04-22 0 2037
4 Apr 2037-04-23 0 2037
$`2037`$Mar
Month date value year
5 Mar 2037-03-21 1.67 2037
6 Mar 2037-03-22 0.00 2037
I have data of patient prescription of oral DM drugs, i.e. DPP4 and SU, and would like to find out if patients had taken the drugs concurrently (i.e. whether there are overlapping intervals for DPP4 and SU within the same patient ID).
Sample data:
ID DRUG START END
1 1 DPP4 2020-01-01 2020-01-20
2 1 DPP4 2020-03-01 2020-04-01
3 1 SU 2020-03-15 2020-04-30
4 2 SU 2020-10-01 2020-10-31
5 2 DPP4 2020-12-01 2020-12-31
In the sample data above,
ID == 1, patient had DPP4 and SU concurrently from 2020-03-15 to 2020-04-01.
ID == 2, patient had consumed both medications at separate intervals.
I thought of splitting the data into 2, one for DPP4 and another for SU. Then, do a full join, and compare each DPP4 interval with each SU interval. This may be okay for small data, but if a patient has like 5 rows for DPP4 and another 5 for SU, we will have 25 comparisons, which may not be efficient. Add that with 10000+ patients.
I am not sure how to do it.
New data:
Hope to have a new df that looks like this. Or anything that is tidy.
ID DRUG START END
1 1 DPP4-SU 2020-03-15 2020-04-01
2 2 <NA> <NA> <NA>
Data Code:
df <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L), DRUG = c("DPP4", "DPP4",
"SU", "SU", "DPP4"), START = structure(c(18262, 18322, 18336,
18536, 18597), class = "Date"), END = structure(c(18281, 18353,
18382, 18566, 18627), class = "Date")), class = "data.frame", row.names = c(NA,
-5L))
df_new <- structure(list(ID = 1:2, DRUG = c("DPP4-SU", NA), START = structure(c(18336,
NA), class = "Date"), END = structure(c(18353, NA), class = "Date")), class = "data.frame", row.names = c(NA,
-2L))
Edit:
I think from the sample data I gave, it may seem that there can only be 1 intersecting interval. But there may be more. So, I think this would be better data to illustrate.
structure(list(ID = c(3, 3, 3, 3, 3, 3, 3), DRUG = c("DPP4",
"DPP4", "SU", "SU", "DPP4", "DPP4", "DPP4"), START = structure(c(17004,
17383, 17383, 17418, 17437, 17649, 17676), class = c("IDate",
"Date")), END = structure(c(17039, 17405, 17405, 17521, 17625,
17669, 17711), class = c("IDate", "Date")), duration = c(35L,
22L, 22L, 103L, 188L, 20L, 35L), INDEX = c(1L, 0L, 0L, 0L, 0L,
0L, 0L)), row.names = c(NA, -7L), class = c("tbl_df", "tbl",
"data.frame"))
It's way more complicated than dear #AnoushiravanR's but as an alternative you could try
library(dplyr)
library(tidyr)
library(lubridate)
df %>%
full_join(x = ., y = ., by = "ID") %>%
# filter(DRUG.x != DRUG.y | START.x != START.y | END.x != END.y) %>%
filter(DRUG.x != DRUG.y) %>%
group_by(ID, intersection = intersect(interval(START.x, END.x), interval(START.y, END.y))) %>%
drop_na(intersection) %>%
filter(START.x == first(START.x)) %>%
summarise(DRUG = paste(DRUG.x, DRUG.y, sep = "-"),
START = as_date(int_start(intersection)),
END = as_date(int_end(intersection)),
.groups = "drop") %>%
select(-intersection)
returning
# A tibble: 1 x 4
ID DRUG START END
<int> <chr> <date> <date>
1 1 DPP4-SU 2020-03-15 2020-04-01
Edit: Changed the filter condition. The former one was flawed.
Updated Solution
I have made considerable modifications based on the newly provided data set. This time I first created interval for each START and END pair and extract the intersecting period between them. As dear Martin nicely made use of them we could use lubridate::int_start and lubridate::int_end to extract the START and END date of each interval:
library(dplyr)
library(lubridate)
library(purrr)
library(tidyr)
df %>%
group_by(ID) %>%
arrange(START, END) %>%
mutate(int = interval(START, END),
is_over = c(NA, map2(int[-n()], int[-1],
~ intersect(.x, .y)))) %>%
unnest(cols = c(is_over)) %>%
select(-int) %>%
filter(!is.na(is_over) | !is.na(lead(is_over))) %>%
select(!c(START, END)) %>%
mutate(grp = cumsum(is.na(is_over))) %>%
group_by(grp) %>%
summarise(ID = first(ID),
DRUG = paste0(DRUG, collapse = "-"),
is_over = na.omit(is_over)) %>%
mutate(START = int_start(is_over),
END = int_end(is_over)) %>%
select(!is_over)
# A tibble: 1 x 5
grp ID DRUG START END
<int> <int> <chr> <dttm> <dttm>
1 1 1 DPP4-SU 2020-03-15 00:00:00 2020-04-01 00:00:00
Second data set:
# A tibble: 2 x 5
grp ID DRUG START END
<int> <dbl> <chr> <dttm> <dttm>
1 1 3 DPP4-SU 2017-08-05 00:00:00 2017-08-27 00:00:00
2 2 3 SU-DPP4 2017-09-28 00:00:00 2017-12-21 00:00:00
Update
As per updated df
df <- structure(list(ID = c(3, 3, 3, 3, 3, 3, 3), DRUG = c(
"DPP4",
"DPP4", "SU", "SU", "DPP4", "DPP4", "DPP4"
), START = structure(c(
17004,
17383, 17383, 17418, 17437, 17649, 17676
), class = c(
"IDate",
"Date"
)), END = structure(c(
17039, 17405, 17405, 17521, 17625,
17669, 17711
), class = c("IDate", "Date")), duration = c(
35L,
22L, 22L, 103L, 188L, 20L, 35L
), INDEX = c(
1L, 0L, 0L, 0L, 0L,
0L, 0L
)), row.names = c(NA, -7L), class = c(
"tbl_df", "tbl",
"data.frame"
))
we obtain
> dfnew
ID DRUG start end
3.3 3 DPP4-SU 2017-08-05 2017-08-27
3.7 3 SU-DPP4 2017-09-28 2017-12-21
A base R option (not as fancy as the answers by #Anoushiravan R or #Martin Gal)
f <- function(d) {
d <- d[with(d, order(START, END)), ]
idx <- subset(
data.frame(which((u <- with(d, outer(START, END, `<`))) & t(u), arr.ind = TRUE)),
row > col
)
if (nrow(idx) == 0) {
return(data.frame(ID = unique(d$ID), DRUG = NA, start = NA, end = NA))
}
with(
d,
do.call(rbind,
apply(
idx,
1,
FUN = function(v) {
data.frame(
ID = ID[v["row"]],
DRUG = paste0(DRUG[sort(unlist(v))], collapse = "-"),
start = START[v["row"]],
end = END[v["col"]]
)
}
))
)
}
dfnew <- do.call(rbind, Map(f, split(df, ~ID)))
gives
> dfnew
ID DRUG start end
1 1 DPP4-SU 2020-03-15 2020-04-01
2 2 <NA> <NA> <NA>
You may use a slightly different approach from the above answers, but this will give you results in format different than required. Obviously, these can be joined to get expected results. You may try this
df <- structure(list(ID = c(3, 3, 3, 3, 3, 3, 3), DRUG = c("DPP4", "DPP4", "SU", "SU", "DPP4", "DPP4", "DPP4"), START = structure(c(17004, 17383, 17383, 17418, 17437, 17649, 17676), class = c("IDate", "Date")), END = structure(c(17039, 17405, 17405, 17521, 17625, 17669, 17711), class = c("IDate", "Date"))), row.names = c(NA, -7L), class = c("tbl_df", "tbl", "data.frame"))
df
#> # A tibble: 7 x 4
#> ID DRUG START END
#> <dbl> <chr> <date> <date>
#> 1 3 DPP4 2016-07-22 2016-08-26
#> 2 3 DPP4 2017-08-05 2017-08-27
#> 3 3 SU 2017-08-05 2017-08-27
#> 4 3 SU 2017-09-09 2017-12-21
#> 5 3 DPP4 2017-09-28 2018-04-04
#> 6 3 DPP4 2018-04-28 2018-05-18
#> 7 3 DPP4 2018-05-25 2018-06-29
library(tidyverse)
df %>%
mutate(treatment_id = row_number()) %>%
pivot_longer(c(START, END), names_to = 'event', values_to = 'dates') %>%
mutate(event = factor(event, levels = c('END', 'START'), ordered = TRUE)) %>%
group_by(ID) %>%
arrange(dates, event, .by_group = TRUE) %>%
mutate(overlap = cumsum(ifelse(event == 'START', 1, -1))) %>%
filter((overlap > 1 & event == 'START') | (overlap > 0 & event == 'END'))
#> # A tibble: 4 x 6
#> # Groups: ID [1]
#> ID DRUG treatment_id event dates overlap
#> <dbl> <chr> <int> <ord> <date> <dbl>
#> 1 3 SU 3 START 2017-08-05 2
#> 2 3 DPP4 2 END 2017-08-27 1
#> 3 3 DPP4 5 START 2017-09-28 2
#> 4 3 SU 4 END 2017-12-21 1
on originally provided data
# A tibble: 2 x 6
# Groups: ID [1]
ID DRUG treatment_id event dates overlap
<int> <chr> <int> <ord> <date> <dbl>
1 1 SU 3 START 2020-03-15 2
2 1 DPP4 2 END 2020-04-01 1
For transforming/getting results in original shape, you may filter overlapping rows
library(tidyverse)
df_new <- structure(list(ID = c(3, 3, 3, 3, 3, 3, 3), DRUG = c("DPP4", "DPP4", "SU", "SU", "DPP4", "DPP4", "DPP4"), START = structure(c(17004, 17383, 17383, 17418, 17437, 17649, 17676), class = c("IDate", "Date")), END = structure(c(17039, 17405, 17405, 17521, 17625, 17669, 17711), class = c("IDate", "Date"))), row.names = c(NA, -7L), class = c("tbl_df", "tbl", "data.frame"))
df_new %>%
mutate(treatment_id = row_number()) %>%
pivot_longer(c(START, END), names_to = 'event', values_to = 'dates') %>%
mutate(event = factor(event, levels = c('END', 'START'), ordered = TRUE)) %>%
group_by(ID) %>%
arrange(dates, event, .by_group = TRUE) %>%
mutate(overlap = cumsum(ifelse(event == 'START', 1, -1))) %>%
filter((overlap > 1 & event == 'START') | (overlap > 0 & event == 'END')) %>%
left_join(df_new %>% mutate(treatment_id = row_number()), by = c('ID', 'DRUG', 'treatment_id'))
#> # A tibble: 4 x 8
#> # Groups: ID [1]
#> ID DRUG treatment_id event dates overlap START END
#> <dbl> <chr> <int> <ord> <date> <dbl> <date> <date>
#> 1 3 SU 3 START 2017-08-05 2 2017-08-05 2017-08-27
#> 2 3 DPP4 2 END 2017-08-27 1 2017-08-05 2017-08-27
#> 3 3 DPP4 5 START 2017-09-28 2 2017-09-28 2018-04-04
#> 4 3 SU 4 END 2017-12-21 1 2017-09-09 2017-12-21
Created on 2021-08-10 by the reprex package (v2.0.0)
I have a count of stems by tree species for different plots.
structure(list(Plot = c(1, 2), Pine = c(0, 430), Spruce = c(708,
1241), Birch = c(119, 48), Aspen = c(0, 0), Salix = c(0, 0),
Rowan = c(0, 0), Alnus = c(0, 0), stem_sum = c(827, 1719)), row.names = c(NA,
-2L), groups = structure(list(.rows = structure(list(1L, 2L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame")), class = c("rowwise_df", "tbl_df", "tbl",
"data.frame"))
What I want to do is use dplyr 1.0 convention to mutate a new column, "Main species", if any of the tree species columns exceed 80% of the stem_sum of that plot.
My thought process:
df %>% rowwise() %>% mutate(`Main species`= c_across(Pine:Alnus, if(.. / stem/sum >= 0.8, paste(...))
How can I modify this code such that if there are more than one column which fulfils the requirement, the output will be "Mixed"?
You can use :
library(dplyr)
df %>%
rowwise() %>%
mutate(Main_Species = if(any(c_across(Pine:Alnus) >= 0.8 * stem_sum))
'Mixed' else 'Not Mixed')
# Plot Pine Spruce Birch Aspen Salix Rowan Alnus stem_sum Main_Species
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#1 1 0 708 119 0 0 0 0 827 Mixed
#2 2 430 1241 48 0 0 0 0 1719 Not Mixed
Or in base R :
df$Main_species <- ifelse(rowSums(df[2:8] >= df$stem_sum * 0.8) > 0,
'Mixed', 'Not Mixed')
library(tidyverse)
df %>%
pivot_longer(-c(Plot, stem_sum)) %>%
arrange(Plot, desc(value)) %>%
group_by(Plot) %>%
mutate(pct = value/stem_sum,
main_species = case_when(
pct > 0.8 & pct == max(pct) ~ name,
pct == max(pct) ~ "mixed"
)
) %>% ungroup() %>%
fill(main_species, .direction = "down") %>%
select(-pct) %>%
pivot_wider()
I need to split 10/4/2018 19:21
and have tried
AccidentsMp$Hours <- format(as.POSIXct(AccidentsMp$Job.Date, "%Y-%m-%d %H:%M:%S", tz = ""), format = "%H:%M")
AccidentsMp$Dates <- format(as.Date(AccidentsMp$Job.Date,"%Y-%m-%d"), format = "%d/%m/%Y")
How can I split the above date and time it into two columns?The data is of class factor now.
Here is an option with tidyverse
library(tidyverse)
library(lubridate)
df %>%
mutate(Job.Date = dmy_hm(Job.Date)) %>%
separate(Job.Date, into = c('date', 'time'), sep=' ', remove = FALSE)
# Job.Date date time
#1 2018-04-10 19:21:00 2018-04-10 19:21:00
#2 2018-04-10 19:22:00 2018-04-10 19:22:00
#3 2018-04-10 19:23:00 2018-04-10 19:23:00
Or using base R
read.table(text = as.character(df$Job.Date), header = FALSE,
col.names = c("date", "time"))
data
df <- structure(list(Job.Date = structure(1:3, .Label = c("10/4/2018 19:21",
"10/4/2018 19:22", "10/4/2018 19:23"), class = "factor")),
class = "data.frame", row.names = c(NA, -3L))
If your data follows the same format as shown we can do it as follows using only base R
df$datetime <- as.POSIXct(df$Job.Date, format = "%d/%m/%Y %H:%M")
transform(df, time = format(datetime, "%T"), date = format(datetime, "%d/%m/%Y"))
# Job.Date datetime time date
#1 10/4/2018 19:21 2018-04-10 19:21:00 19:21:00 10/04/2018
#2 10/4/2018 19:22 2018-04-10 19:22:00 19:22:00 10/04/2018
#3 10/4/2018 19:23 2018-04-10 19:23:00 19:23:00 10/04/2018
You can remove the datetime column later if not needed.
data
df <- data.frame(Job.Date = c("10/4/2018 19:21", "10/4/2018 19:22",
"10/4/2018 19:23"))
If one needs to do it the hard way:
text<-"10/4/2018 19:21"
res<-strsplit(text," ")
df$Date<-res[[1]][1]
df$Time<-res[[1]][2]
#install.packages("lubridate")
df$Date<-lubridate::mdy(df$Date)
df$Time<-lubridate::hm(df$Time)
You can get the time and date without using any packages as:
df$Time<-format(strptime(res[[1]][2],"%H:%M",tz=""),"%H:%M") #cleaner output
df$Date<- as.Date(res[[1]][1],"%m/%d/%Y")
Result using lubridate:
month item sales Date Time
1 1 A 10 2018-10-04 19H 21M 0S
2 2 b 20 2018-10-04 19H 21M 0S
3 2 c 5 2018-10-04 19H 21M 0S
4 3 a 3 2018-10-04 19H 21M 0S
Data
df<-structure(list(month = c(1, 2, 2, 3), item = structure(c(2L,
3L, 4L, 1L), .Label = c("a", "A", "b", "c"), class = "factor"),
Time = new("Period", .Data = c(0, 0, 0, 0), year = c(0, 0,
0, 0), month = c(0, 0, 0, 0), day = c(0, 0, 0, 0), hour = c(19,
19, 19, 19), minute = c(21, 21, 21, 21))), class = "data.frame", row.names = c(NA,
-4L))