tidyverse gather multiple columns - r

I have the following data frame:
df <- structure(list(ID = 1:4, col1.date = structure(c(1546188000,
1272294300, 1087908540, 1512241620), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), col2.date = structure(c(1546237740, 1272928800,
1087966800, 1512277200), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
col3.date = structure(c(1546323000, 1272949200, 1088049600,
1512396000), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
col1.result = c(1.31, 0.95, 3.3, 0.55), col2.result = c(1.19,
1.57, 1.6, 0.59), col3.result = c(0.97, 2.13, 1.1, 0.57)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -4L))
I would like to have for each ID three rows and two columns: result and date.
This is what I have tried:
df_long <- df %>%
gather(v, value, col1.date:col3.result) %>%
separate(v, c("var", "col"))
however I am getting the date transformed to numeric.
What am I doing wrong?

Since you ultimately want to reshape multiple columns (and it is the "new way" of tidyr-1.0.0), then try pivot_longer. This answer is adapted directly from the example in the help page at ?pivot_longer:
df %>%
pivot_longer(
col1.date:col3.result,
names_to = c("set", ".value"),
names_pattern = "(.*)\\.(.*)"
)
# # A tibble: 12 x 4
# ID set date result
# <int> <chr> <dttm> <dbl>
# 1 1 col1 2018-12-30 16:40:00 1.31
# 2 1 col2 2018-12-31 06:29:00 1.19
# 3 1 col3 2019-01-01 06:10:00 0.97
# 4 2 col1 2010-04-26 15:05:00 0.95
# 5 2 col2 2010-05-03 23:20:00 1.57
# 6 2 col3 2010-05-04 05:00:00 2.13
# 7 3 col1 2004-06-22 12:49:00 3.3
# 8 3 col2 2004-06-23 05:00:00 1.6
# 9 3 col3 2004-06-24 04:00:00 1.1
# 10 4 col1 2017-12-02 19:07:00 0.55
# 11 4 col2 2017-12-03 05:00:00 0.59
# 12 4 col3 2017-12-04 14:00:00 0.570

Related

How to filter all the elements of a list in R, when the filter uses the mean of each element, so the condition changes for each element?

I have a dataset with temperature data for each day, so I grouped the rows by date. In the end I have a list with a data frame for each day. Now what I want to do is filter all of these data frames by a range: the filter is the mean temperature for that day (data frame) +- 0.5°C.
But the problem is that each data frame in the list has a different mean value (I hope I'm being clear).
So I want to filter by the mean value of a column, but this mean changes for every data frame.
How can I solve this problem?
I'm an amateur in R, so anything is helpful. Thank you in advance.
This is a short version of my list:
structure(list(structure(list(Date = structure(c(1646434800,
1646434800, 1646434800, 1646434800, 1646434800, 1646434800, 1646434800,
1646434800, 1646434800, 1646434800), tzone = "", class = c("POSIXct",
"POSIXt")), V4 = c(0.875, 0.5, 0.1875, -0.1875, -0.5, -0.8125,
-1.125, -1.375, -1.625, -1.875)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(Date = structure(c(1646521200,
1646521200, 1646521200, 1646521200, 1646521200, 1646521200, 1646521200,
1646521200, 1646521200, 1646521200, 1646521200), tzone = "", class = c("POSIXct",
"POSIXt")), V4 = c(3.75, 3.75, 3.6875, 3.6875, 3.6875, 3.6875,
3.6875, 3.625, 3.625, 3.625, 3.625)), row.names = c(NA, -11L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(Date = structure(c(1646607600,
1646607600, 1646607600, 1646607600, 1646607600, 1646607600, 1646607600,
1646607600, 1646607600, 1646607600, 1646607600), tzone = "", class = c("POSIXct",
"POSIXt")), V4 = c(3.6875, 3.6875, 3.6875, 3.6875, 3.6875, 3.625,
3.625, 3.625, 3.625, 3.625, 3.625)), row.names = c(NA, -11L), class = c("tbl_df",
"tbl", "data.frame"))), ptype = structure(list(Date = structure(numeric(0), tzone = "", class = c("POSIXct",
"POSIXt")), V4 = numeric(0)), class = c("tbl_df", "tbl", "data.frame"
), row.names = integer(0)), class = c("vctrs_list_of", "vctrs_vctr",
"list"))
You can do this in several ways. Suppose mydata is the list that you provided in the question.
In dplyr you can bind the rows of all the data frames in mydata first to create a single data frame, and then group them by the Date, and then apply the filter to each group. The result is a data frame.
do.call(rbind, mydata) %>%
group_by(Date) %>% filter((V4 <= mean(V4) + 0.5) &
(V4 >= mean(V4)-0.5))
# A tibble: 25 x 2
# Groups: Date [3]
# Date V4
# <dttm> <dbl>
# 1 2022-03-05 06:00:00 -0.188
# 2 2022-03-05 06:00:00 -0.5
# 3 2022-03-05 06:00:00 -0.812
# 4 2022-03-06 06:00:00 3.75
# 5 2022-03-06 06:00:00 3.75
# 6 2022-03-06 06:00:00 3.69
# 7 2022-03-06 06:00:00 3.69
# 8 2022-03-06 06:00:00 3.69
# 9 2022-03-06 06:00:00 3.69
# 10 2022-03-06 06:00:00 3.69
# ... with 15 more rows
In R base you can define your function that filters a single data frame, and then apply the function to mydata. The result is a list of data frames.
# Filter one day's data frame to rows whose temperature is within
# +/- 0.5 of that data frame's own mean.
#
# @param df A data frame (or tibble) with a numeric column `V4`.
# @return The subset of `df` whose `V4` values fall in
#   [mean(V4) - 0.5, mean(V4) + 0.5].
myfilter <- function(df) {
  center <- mean(df$V4)
  # The original wrote `mean(df$V4 + 0.5)`, which only gives the right
  # answer because the mean is linear; compute the band bounds explicitly
  # so the intent is unmistakable.
  df[df$V4 >= center - 0.5 & df$V4 <= center + 0.5, ]
}
lapply(mydata, myfilter)
# [[1]]
# # A tibble: 3 x 2
# Date V4
# <dttm> <dbl>
# 1 2022-03-05 06:00:00 -0.188
# 2 2022-03-05 06:00:00 -0.5
# 3 2022-03-05 06:00:00 -0.812
#
# [[2]]
# # A tibble: 11 x 2
# Date V4
# <dttm> <dbl>
# 1 2022-03-06 06:00:00 3.75
# 2 2022-03-06 06:00:00 3.75
# 3 2022-03-06 06:00:00 3.69
# 4 2022-03-06 06:00:00 3.69
# 5 2022-03-06 06:00:00 3.69
# 6 2022-03-06 06:00:00 3.69
# 7 2022-03-06 06:00:00 3.69
# 8 2022-03-06 06:00:00 3.62
# 9 2022-03-06 06:00:00 3.62
# 10 2022-03-06 06:00:00 3.62
# 11 2022-03-06 06:00:00 3.62
#
# [[3]]
# # A tibble: 11 x 2
# Date V4
# <dttm> <dbl>
# 1 2022-03-07 06:00:00 3.69
# 2 2022-03-07 06:00:00 3.69
# 3 2022-03-07 06:00:00 3.69
# 4 2022-03-07 06:00:00 3.69
# 5 2022-03-07 06:00:00 3.69
# 6 2022-03-07 06:00:00 3.62
# 7 2022-03-07 06:00:00 3.62
# 8 2022-03-07 06:00:00 3.62
# 9 2022-03-07 06:00:00 3.62
# 10 2022-03-07 06:00:00 3.62
# 11 2022-03-07 06:00:00 3.62

Sum cumulative time between changes in a single status variable in R

I've been looking for answers and messing around with my code for a couple hours. I have a dataset that looks like the following for a specific ID:
# A tibble: 14 × 3
ID state orderDate
<dbl> <chr> <dttm>
1 4227631 1 2022-03-14 19:00:00
2 4227631 1 2022-03-14 20:00:00
3 4227631 1 2022-03-15 11:00:00
4 4227631 0 2022-03-15 11:00:00
5 4227631 1 2022-03-15 20:00:00
6 4227631 1 2022-03-16 04:00:00
7 4227631 0 2022-03-16 04:00:00
8 4227631 1 2022-03-16 05:00:00
9 4227631 0 2022-03-16 13:00:00
10 4227631 1 2022-03-16 15:00:00
This occurs for hundreds of IDs. For this example, I am using dplyr to group_by ID. I only care when status changes between values, not if it stays the same.
I want to calculate the cumulative time each ID remains in status 1. The instances where status 1 is repeated multiple times before it changes should be ignored. I have been planning to use lubridate and dplyr to perform the analysis.
Tibble I am using for this example:
structure(list(ID = c(4227631, 4227631, 4227631, 4227631, 4227631,
4227631, 4227631, 4227631, 4227631, 4227631), state = c("1",
"1", "1", "0", "1", "1", "0", "1", "0", "1"), orderDate = structure(c(1647284400,
1647288000, 1647342000, 1647342000, 1647374400, 1647403200, 1647403200,
1647406800, 1647435600, 1647442800), tzone = "UTC", class = c("POSIXct",
"POSIXt"))), row.names = c(NA, -10L), class = c("tbl_df", "tbl",
"data.frame"))
I've tried various solutions such as Cumulative time with reset however I'm having trouble with lag and incorporating it into this specific analysis.
The expected output would maybe look something like this:
And then I would plan to sum all statusOne together to figure out cumulative time spent in this state.
I welcome more elegant solutions, or a link to a prior question if someone has one.
EDIT
Using solution below I figured it out!
The solution didn't look at the situations where state 0 immediately followed state 1 and we wanted to look at the total time elapsed between these states.
df %>%
group_by(ID) %>%
mutate(max = cumsum(ifelse(state == lag(state, default = "1"), 0, 1))) %>%
mutate(hours1 = ifelse(max == lag(max) &
state=="1", difftime(orderDate, lag(orderDate), units = "h"), NA)) %>%
mutate(hours2 = ifelse(state=="0" & lag(state)=="1",
difftime(orderDate, lag(orderDate), units = "h"), NA)) %>%
mutate(hours1 = replace_na(hours1, 0),
hours2 = replace_na(hours2, 0)) %>%
mutate(hours = hours1+hours2) %>%
select(-hours1, -hours2) %>%
summarise(total_hours = sum(hours, na.rm = TRUE)) %>%
filter(total_hours!=0)
This is far from elegant, but at least it appears to provide the correct answer:
library(tidyverse)
df <- structure(list(ID = c(4227631, 4227631, 4227631, 4227631, 4227631,
4227631, 4227631, 4227631, 4227631, 4227631),
state = c("1", "1", "1", "0", "1", "1", "0", "1", "0", "1"),
orderDate = structure(c(1647284400, 1647288000, 1647342000,
1647342000, 1647374400, 1647403200,
1647403200, 1647406800, 1647435600,
1647442800),
tzone = "UTC",
class = c("POSIXct", "POSIXt"))),
row.names = c(NA, -10L),
class = c("tbl_df", "tbl", "data.frame"))
df2 <- df %>%
group_by(ID) %>%
mutate(tmp = ifelse(state == lag(state, default = "1"), 0, 1),
max = cumsum(tmp)) %>%
mutate(hours = ifelse(max == lag(max), difftime(orderDate, lag(orderDate), units = "h"), NA)) %>%
select(-tmp)
df3 <- df2 %>%
group_by(max) %>%
summarise(max, statusOne = sum(hours, na.rm = TRUE))
df4 <- left_join(df2, df3, by = "max") %>%
distinct() %>%
select(-c(max, hours)) %>%
mutate(statusOne = ifelse(statusOne != 0 & lag(statusOne, default = 1) == statusOne, 0, statusOne))
df4
#> # A tibble: 10 × 4
#> # Groups: ID [1]
#> ID state orderDate statusOne
#> <dbl> <chr> <dttm> <dbl>
#> 1 4227631 1 2022-03-14 19:00:00 16
#> 2 4227631 1 2022-03-14 20:00:00 0
#> 3 4227631 1 2022-03-15 11:00:00 0
#> 4 4227631 0 2022-03-15 11:00:00 0
#> 5 4227631 1 2022-03-15 20:00:00 8
#> 6 4227631 1 2022-03-16 04:00:00 0
#> 7 4227631 0 2022-03-16 04:00:00 0
#> 8 4227631 1 2022-03-16 05:00:00 0
#> 9 4227631 0 2022-03-16 13:00:00 0
#> 10 4227631 1 2022-03-16 15:00:00 0
Created on 2022-04-04 by the reprex package (v2.0.1)
Edit
It's a lot more straightforward to get the total_hours state=1 for each ID:
df %>%
group_by(ID) %>%
mutate(max = cumsum(ifelse(state == lag(state, default = "1"), 0, 1))) %>%
mutate(hours = ifelse(max == lag(max), difftime(orderDate, lag(orderDate), units = "h"), NA)) %>%
summarise(total_hours = sum(hours, na.rm = TRUE))
#> # A tibble: 1 × 2
#> ID total_hours
#> <dbl> <dbl>
#> 1 4227631 24
Created on 2022-04-04 by the reprex package (v2.0.1)

Adjust date format after using dput in R

As you can see, dput has misconfigured the dates from my database. How can I fix this, or is it supposed to be that way?
library(readxl)
df<-read_excel('C:/Desktop/example.xlsx')
dput(df)
> dput(df)
structure(list(Date = structure(c(1629936000, 1629936000, 1629936000,
1629936000, 1629936000, 1629936000), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), Upd = structure(c(1577836800, 1577836800,
1577836800, 1577836800, 1580601600, 1580601600), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), `Value 1` = c(12, 12, 3, 4, 5, 6)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
You could reformat the data:
library(dplyr)
df %>%
mutate(across(c(Date, Upd), ~format(.x, "%d/%m/%Y")))
This returns
# A tibble: 6 x 3
Date Upd `Value 1`
<chr> <chr> <dbl>
1 26/08/2021 01/01/2020 12
2 26/08/2021 01/01/2020 12
3 26/08/2021 01/01/2020 3
4 26/08/2021 01/01/2020 4
5 26/08/2021 02/02/2020 5
6 26/08/2021 02/02/2020 6
Or with base R
df$Date <- format(df$Date, "%d/%m/%Y")
df$Upd <- format(df$Upd, "%d/%m/%Y")
In base R
df[c("Date", "Upd")] <- lapply(df[c("Date", "Upd")], format, "%d/%m/%Y")
-output
df
# A tibble: 6 x 3
Date Upd `Value 1`
<chr> <chr> <dbl>
1 26/08/2021 01/01/2020 12
2 26/08/2021 01/01/2020 12
3 26/08/2021 01/01/2020 3
4 26/08/2021 01/01/2020 4
5 26/08/2021 02/02/2020 5
6 26/08/2021 02/02/2020 6
I'm not sure if this is exactly what you're going after, but I deal a lot with Excel files where a column that should be a date gets imported as numeric. I made this little function to help out with that:
# Convert an Excel (Windows) serial date number to a Date.
#
# @param number A numeric vector of Excel day counts.
# @return A Date vector; Excel counts days from an epoch of 1899-12-30.
excel_date_number_to_date <- function(number) {
  excel_epoch <- "1899-12-30"
  as.Date(number, origin = excel_epoch)
}
For your dataframe you can use it like this:
df %>%
mutate(across(c(Date, Upd), ~excel_date_number_to_date(.x)))

Merge two dataframes: specifically merge a selection of columns based on two conditions?

I have two datasets on the same 2 patients. With the second dataset I want to add new information to the first, but I can't seem to get the code right.
My first (incomplete) dataset has a patient ID, measurement time (either T0 or FU1), year of birth, date of the CT scan, and two outcomes (legs_mass and total_mass):
library(tidyverse)
library(dplyr)
library(magrittr)
library(lubridate)
df1 <- structure(list(ID = c(115, 115, 370, 370), time = structure(c(1L,
6L, 1L, 6L), .Label = c("T0", "T1M0", "T1M6", "T1M12", "T2M0",
"FU1"), class = "factor"), year_of_birth = c(1970, 1970, 1961,
1961), date_ct = structure(c(16651, 17842, 16651, 18535), class = "Date"),
legs_mass = c(9.1, NA, NA, NA), total_mass = c(14.5, NA,
NA, NA)), row.names = c(NA, -4L), class = c("tbl_df", "tbl",
"data.frame"))
# Which gives the following dataframe
df1
# A tibble: 4 x 6
ID time year_of_birth date_ct legs_mass total_mass
<dbl> <fct> <dbl> <date> <dbl> <dbl>
1 115 T0 1970 2015-08-04 9.1 14.5
2 115 FU1 1970 2018-11-07 NA NA
3 370 T0 1961 2015-08-04 NA NA
4 370 FU1 1961 2020-09-30 NA NA
The second dataset adds to the legs_mass and total_mass columns:
df2 <- structure(list(ID = c(115, 370), date_ct = structure(c(17842,
18535), class = "Date"), ctscan_label = c("PXE115_CT_20181107_xxxxx-3.tif",
"PXE370_CT_20200930_xxxxx-403.tif"), legs_mass = c(956.1, 21.3
), total_mass = c(1015.9, 21.3)), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame"))
# Which gives the following dataframe:
df2
# A tibble: 2 x 5
ID date_ct ctscan_label legs_mass total_mass
<dbl> <date> <chr> <dbl> <dbl>
1 115 2018-11-07 PXE115_CT_20181107_xxxxx-3.tif 956. 1016.
2 370 2020-09-30 PXE370_CT_20200930_xxxxx-403.tif 21.3 21.3
What I am trying to do, is...
Add the legs_mass and total_mass column values from df2 to df1, based on ID number and date_ct.
Add the new columns of df2 (the one that is not in df1; ctscan_label) to df1, also based on the date of the ct and patient ID.
So that the final dataset df3 looks as follows:
df3 <- structure(list(ID = c(115, 115, 370, 370), time = structure(c(1L,
6L, 1L, 6L), .Label = c("T0", "T1M0", "T1M6", "T1M12", "T2M0",
"FU1"), class = "factor"), year_of_birth = c(1970, 1970, 1961,
1961), date_ct = structure(c(16651, 17842, 16651, 18535), class = "Date"),
legs_mass = c(9.1, 956.1, NA, 21.3), total_mass = c(14.5,
1015.9, NA, 21.3)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
# Corresponding to the following tibble:
# A tibble: 4 x 6
ID time year_of_birth date_ct legs_mass total_mass
<dbl> <fct> <dbl> <date> <dbl> <dbl>
1 115 T0 1970 2015-08-04 9.1 14.5
2 115 FU1 1970 2018-11-07 956. 1016.
3 370 T0 1961 2015-08-04 NA NA
4 370 FU1 1961 2020-09-30 21.3 21.3
I have tried the merge function and rbind from baseR, and bind_rows from dplyr but can't seem to get it right.
Any help?
You can join the two datasets and use coalesce to keep one non-NA value from the two datasets.
library(dplyr)
left_join(df1, df2, by = c("ID", "date_ct")) %>%
mutate(leg_mass = coalesce(legs_mass.x , legs_mass.y),
total_mass = coalesce(total_mass.x, total_mass.y)) %>%
select(-matches('\\.x|\\.y'), -ctscan_label)
# ID time year_of_birth date_ct leg_mass total_mass
# <dbl> <fct> <dbl> <date> <dbl> <dbl>
#1 115 T0 1970 2015-08-04 9.1 14.5
#2 115 FU1 1970 2018-11-07 956. 1016.
#3 370 T0 1961 2015-08-04 NA NA
#4 370 FU1 1961 2020-09-30 21.3 21.3
We can use data.table methods
library(data.table)
setDT(df1)[setDT(df2), c("legs_mass", "total_mass") :=
.(fcoalesce(legs_mass, i.legs_mass),
fcoalesce(total_mass, i.total_mass)), on = .(ID, date_ct)]
-output
df1
ID time year_of_birth date_ct legs_mass total_mass
1: 115 T0 1970 2015-08-04 9.1 14.5
2: 115 FU1 1970 2018-11-07 956.1 1015.9
3: 370 T0 1961 2015-08-04 NA NA
4: 370 FU1 1961 2020-09-30 21.3 21.3

calculate time difference in the same group

I want to convert the column time to be in time decimal format and then find the time interval within each group of the user_id. I have tried the answer below, but I could not get it to work:
Days difference between two dates in same column in R
structure(list(question_id = c(5502L, 5502L, 5502L, 5502L, 5502L
), user_id = c(112197L, 112197L, 112197L, 114033L, 114033L),
time = structure(c(1603720173, 1603720388, 1603720702, 1603603115,
1603949442), class = c("POSIXct", "POSIXt"), tzone = ""),
prediction = c(0.9, 0.95, 0.9, 0.99, 0.94), log_score = c(0.84799690655495,
0.925999418556223, 0.84799690655495, 0.985500430304885, 0.910732661902913
)), row.names = 156182:156186, class = "data.frame")
Perhaps this is what you're looking for?
library(dplyr)
user_data %>%
group_by(user_id) %>%
summarise(day.interval = difftime(max(time), min(time),units = "days"))
# A tibble: 2 x 2
user_id day.interval
<int> <drtn>
1 112197 0.006122685 days
2 114033 4.008414352 days
library(tidyverse)
library(lubridate)
df <- tibble::tribble(
~question_id, ~user_id, ~time, ~prediction, ~log_score,
5502L, 112197L, "2020-10-26 14:49:33", 0.9, 0.84799690655495,
5502L, 112197L, "2020-10-26 14:53:08", 0.95, 0.925999418556223,
5502L, 112197L, "2020-10-26 14:58:22", 0.9, 0.84799690655495,
5502L, 114033L, "2020-10-25 06:18:35", 0.99, 0.985500430304885,
5502L, 114033L, "2020-10-29 06:30:42", 0.94, 0.910732661902913
)
df %>%
as_tibble() %>%
mutate(time = lubridate::ymd_hms(time)) %>%
group_by(user_id) %>%
mutate(diff = time - lag(time),
diff2 = hms::hms(seconds_to_period(diff)))
#> # A tibble: 5 x 7
#> # Groups: user_id [2]
#> question_id user_id time prediction log_score diff diff2
#> <int> <int> <dttm> <dbl> <dbl> <drtn> <time>
#> 1 5502 112197 2020-10-26 14:49:33 0.9 0.848 NA secs NA
#> 2 5502 112197 2020-10-26 14:53:08 0.95 0.926 215 secs 00:03:35
#> 3 5502 112197 2020-10-26 14:58:22 0.9 0.848 314 secs 00:05:14
#> 4 5502 114033 2020-10-25 06:18:35 0.99 0.986 NA secs NA
#> 5 5502 114033 2020-10-29 06:30:42 0.94 0.911 346327 secs 96:12:07

Resources