calculate column percentage difference - r

My df have more than 2 columns (for example, 4). I want to calculate the percentage difference of the column 2-4 compared to column 1.
time1 time2 time3 time4
1 5 1 10
2 2 2 4
3 6 3 12
My code is:
timepoint <- colnames(df)[2:4]
for (x in timepoint){
df$x <- 100*(df$x/df$time1-1)
}
What's wrong with this for function? Thank you!

Maybe this is a too long answer, but I think it's a start step
library(dplyr)
library(tibble)
library(tidyr)
df <- tribble(
~time1, ~time2, ~time3, ~time4,
1, 5, 1, 10,
2, 2, 2, 4,
3, 6, 3, 12
)
df_tidy <- df %>%
mutate(id = 1:nrow(.)) %>%
gather(time, value, time1:time4) %>%
mutate(id_base = ifelse(time == "time1", TRUE, FALSE))
df_calc <- filter(df_tidy, id_base == FALSE)
df_base <- df_tidy %>%
filter(id_base== TRUE) %>%
select(id, value_base = value)
df_join <- df_calc %>%
left_join(
df_base,
by = "id"
)
df_join %>%
mutate(diff = (value / value_base) * 100)
# A tibble: 9 × 4
id time value diff
<int> <chr> <dbl> <dbl>
1 1 time2 5 500
2 2 time2 2 100
3 3 time2 6 200
4 1 time3 1 100
5 2 time3 2 100
6 3 time3 3 100
7 1 time4 10 1000
8 2 time4 4 200
9 3 time4 12 400

Related

reshape grouped data in R

I have the following data:
id <- c(1,1,1,1,2,2,2,2,2,2)
date <-as.Date(c("2007-06-22", "2007-06-22", "2007-07-13","2007-07-13",
"2019-10-05", "2019-10-05", "2019-11-07", "2019-11-07",
"2007-06-22","2007-06-22"))
value <-c(0,3,2,4,0,1,4,2,6,8)
mydata_1 <- data.frame(id, date, value)
mydata_1
id date value
1 2007-06-22 0
1 2007-06-22 3
1 2007-07-13 2
1 2007-07-13 4
2 2019-10-05 0
2 2019-10-05 1
2 2019-11-07 4
2 2019-11-07 2
2 2007-06-22 6
2 2007-06-22 8
I would like the data to look like this:
id <- c(1,1,2,2,2)
date <-as.Date(c("2007-06-22", "2007-07-13", "2019-10-05", "2019-11-07","2007-06-22"))
value.1 = c(0,2,0,4,6)
value.2 = c(3,4,1,2,8)
mydata_2 <- data.frame(id, date, value.1, value.2)
mydata_2
id date value.1 value.2
1 2007-06-22 0 3
1 2007-07-13 2 4
2 2019-10-05 0 1
2 2019-11-07 4 2
2 2007-06-22 6 8
I have tried below from (Reshaping data matrix in R) but since some of the dates are the same in the two different id's it is not working as intended
dateno <- with(mydata_1, ave(id, date, FUN = seq_along))
test2 <- transform(mydata_1, dateno = dateno)
reshape(test2, dir = "wide", idvar = c("id","date"), timevar = "dateno")
I think I have come up with an answer following this guide How to transpose a data frame by group using reshape2 library?
mydata_1 = mydata_1 %>% group_by(id,date) %>% mutate(id_2 = paste0("V",row_number()))
library(tidyr)
mydata_2 = spread(data = my, key = id_2, value = value)
mydata_2
id date V1 V2
<dbl> <date> <dbl> <dbl>
1 1 2007-06-22 0 3
2 1 2007-07-13 2 4
3 2 2007-06-22 6 8
4 2 2019-10-05 0 1
5 2 2019-11-07 4 2
Maybe sth. like this:
library(tidyverse)
id <- c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2)
date <- as.Date(c(
"2007-06-22", "2007-06-22", "2007-07-13", "2007-07-13",
"2019-10-05", "2019-10-05", "2019-11-07", "2019-11-07",
"2007-06-22", "2007-06-22"
))
value <- c(0, 3, 2, 4, 0, 1, 4, 2, 6, 8)
mydata_1 <- data.frame(id, date, value)
mydata_1
mydata_1 %>%
group_by(id, date) %>%
mutate(visit = row_number()) %>%
complete(id, date, fill = list(value = 0)) %>%
pivot_wider(names_from = visit, values_from = value, names_prefix = "value.")
Created on 2021-11-25 by the reprex package (v2.0.1)
Another possible solution:
library(tidyverse)
id <- c(1,1,1,1,2,2,2,2,2,2)
date <-as.Date(c("2007-06-22", "2007-06-22", "2007-07-13","2007-07-13",
"2019-10-05", "2019-10-05", "2019-11-07", "2019-11-07",
"2007-06-22","2007-06-22"))
value <-c(0,3,2,4,0,1,4,2,6,8)
mydata_1 <- data.frame(id, date, value)
mydata_1 %>%
group_by(id, date) %>%
summarise(value = str_c(value, collapse = ","), .groups = "drop") %>%
separate(value, into=c("value1", "value2"), sep=",", convert = T)
#> # A tibble: 5 × 4
#> id date value1 value2
#> <dbl> <date> <int> <int>
#> 1 1 2007-06-22 0 3
#> 2 1 2007-07-13 2 4
#> 3 2 2007-06-22 6 8
#> 4 2 2019-10-05 0 1
#> 5 2 2019-11-07 4 2

Dplyr Summarize: Combining values for certain groups

I have data on hospital admissions per patients. I am trying add up the price of care for patients that were re-admitted to hospital within 5 days.
This is an example dataset:
(
dt <- data.frame(
id = c(1, 1, 2, 2, 3, 4),
admit_date = c(1, 9, 5, 9, 10, 20),
price = c(10, 20, 20, 30, 15, 16)
)
)
# id admit_date price
# 1 1 1 10
# 2 1 9 20
# 3 2 5 20
# 4 2 9 30
# 5 3 10 15
# 6 4 20 16
And this is what I have tried so far:
library(dplyr)
# 5-day readmission:
dt %>%
group_by(id) %>%
arrange(id, admit_date)%>%
mutate(
duration = admit_date - lag(admit_date),
readmit = ifelse(duration < 6, 1, 0)
) %>%
group_by(id, readmit) %>% # this is where i get stuck
summarize(sumprice = sum(price))
# # A tibble: 6 × 3
# # Groups: id [4]
# id readmit sumprice
# <dbl> <dbl> <dbl>
# 1 1 0 20
# 2 1 NA 10
# 3 2 1 30
# 4 2 NA 20
# 5 3 NA 15
# 6 4 NA 16
And this is what I would like to have:
# id sum_price
# 1 1 10
# 2 1 20
# 3 2 50
# 4 3 15
# 5 4 16
If the difference in days, between adjacent visits is greater than 5 - return TRUE if not - return FALSE (-Inf > 5 is FALSE for the first day, thus lags default is Inf). After that, for each individual we take a cumulative sum to label the groups. We finally summarize within each individual, using this cumsum as a grouping variable for by:
dt |>
group_by(id) |>
arrange(id, admit_date) |>
summarise(
sum_price = by(
price,
cumsum((admit_date - lag(admit_date, , Inf)) > 5),
sum
)
) |>
ungroup()
# # A tibble: 5 × 2
# id sum_price
# <dbl> <by>
# 1 1 10
# 2 1 20
# 3 2 50
# 4 3 15
# 5 4 16
So, you want (at most) one row per patient in the final dataframe, so you should group on just id.
Then, for each patient, you should calculate if that patient has any row with readmit==).
Finally, you filter out any patient that wasn't readmitted from your summarized dataframe.
Putting it all together, it might look like:
dt %>%
group_by(id) %>%
arrange(id, admit_date) %>%
mutate(duration = admit_date - lag(admit_date),
readmit = ifelse(duration < 6, 1, 0)) %>%
group_by(id) %>% # group by just 'id' to get one row per patient
summarize(sumprice = sum(price, na.rm = T),
is_readmit = any(readmit == 1)) %>% # If patient has any 'readmit' rows, count the patient as a readmit patient
filter(is_readmit) %>% # Filter out any non-readmit patients
select(-is_readmit) # get rid of the `is_readmit` column
Which should result in:
# A tibble: 1 x 3
id sumprice is_readmit
<dbl> <dbl> <lgl>
1 2 50 TRUE

Create a group variable based on different criteria of consecutive scores

I have a dataset that contains just the subject id and scores from different time points. Is there a way for me to create a group variable based on their scores? For example, if a subject has 6 consecutive scores of 1 or 2, I would put them in group "a" | if they had 4 consecutive scores of 3, I would put them in group "b" | if they had 6 consecutive scores of 4 or higher, I would put them in group "c".
Here is an example dataset:
id score1 score2 score3 score4 score5 score6 score7 score8 group
101 2 2 2 2 1 2 2 1 a
102 4 4 3 3 3 3 4 4 b
103 4 5 5 5 5 6 5 5 c
Here is the R code for the above table without the "group" column
structure(list(id = c(101, 102, 103), score1 = c(2, 4, 4), score2 = c(2,
4, 5), score3 = c(2, 3, 5), score4 = c(2, 3, 5), score5 = c(1,
3, 5), score6 = c(2, 3, 6), score7 = c(2, 4, 5), score8 = c(1,
4, 5)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
))
Any ideas are appreciated! Thank you so much :)
The function at the heart of all solutions is rle(). How you handle everything around is up to you.
library(tidyverse, quietly = TRUE)
score_df %>%
pivot_longer(score1:score8) %>%
mutate(value =
case_when(
value <= 2 ~ 1,
value >= 4 ~ 4,
TRUE ~ value
)) %>%
group_by(id) %>%
group_map(~{
r <- rle(.$value)
highest_val <- max(r$values)
longest_len <- max(r$lengths)
case_when(max(r$value) == 1 ~ "a",
any(r$lengths[which(r$value == 3)] >= 4) ~ "b",
any(r$lengths[which(r$value == 4)] >= 6) ~ "c",
TRUE ~ NA_character_)
}) %>%
unlist()
#> [1] "a" "b" "c"
Loop over the rows of numeric columns of the data with apply (MARGIN = 1), replace the values 1 to 2 to 1, and those that are greater than or equal to 4 to 4, then get the rle (run-length-encoding) on the replaced values in the row, extract the 'values' and 'lengths', create a logical expression based on the conditions specified in OP's post and return the desired group values if those conditions are met
library(dplyr)
df1$group <- apply(df1[-1], 1, function(x) {
x <- case_when(x %in% 1:2 ~ 1, x >=4 ~ 4, TRUE ~ x)
v1 <- rle(x)
na.omit(case_when(v1$values == 1 & v1$lengths >= 6 ~ 'a',
v1$values == 3 & v1$lengths >=4 ~ 'b',
v1$values ==4 & v1$lengths >= 6 ~ 'c' )) })
df1$group
#[1] "a" "b" "c"
Or using tidyverse
library(data.table)
library(tidyr)
df1 %>%
pivot_longer(cols = -id) %>%
mutate(newvalue = case_when(value %in% 1:2 ~ 1,
value >= 4 ~ 4, TRUE ~ value)) %>%
add_count(id, grp = rleid(newvalue)) %>%
group_by(id) %>%
summarise( group = first(na.omit(case_when(newvalue == 1 & n >= 6 ~ 'a',
newvalue == 3 & n >= 4 ~'b',
newvalue == 4 & n >= 6 ~ 'c'))), .groups = 'drop') %>%
left_join(df1, .)
-output
# A tibble: 3 x 10
# id score1 score2 score3 score4 score5 score6 score7 score8 group
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#1 101 2 2 2 2 1 2 2 1 a
#2 102 4 4 3 3 3 3 4 4 b
#3 103 4 5 5 5 5 6 5 5 c
Using base R, you could do:
pat <- c(a = "[12]{6}", b="3{4}", c="[4-9]{6}")
cbind(df, group = names(pat)[max.col(sapply(pat, grepl, do.call(paste0, df[-1])))])
id score1 score2 score3 score4 score5 score6 score7 score8 group
1 101 2 2 2 2 1 2 2 1 a
2 102 4 4 3 3 3 3 4 4 b
3 103 4 5 5 5 5 6 5 5 c

Transpose and sum distinct values in R

IS there a way to transpose and summing distinct values in R For example
df
Cola Order Quantity Loc
ABC 1 4 LocA
ABC 1 4 LocB
CSD 4 6 LocA
CDS 3 2 LocB
We have same values for Order and Quantity but still need to take sum of it.
Expected Output (Transpose with respect to Quantity)
Cola Order Quantity LocA_Quantity Loc B_Quantity
ABC 2 8 4 4
CSD 4 6 6
CDS 3 2 2
Create the dataset:
library(tibble)
df = tribble(
~Cola, ~Order, ~Quantity, ~Loc,
'ABC', 1, 4, 'LocA',
'ABC', 1, 4, 'LocB',
'CSD', 4, 6, 'LocA',
'CDS', 3, 2, 'LocB'
)
Create the summaries:
library(dplyr)
df %>%
group_by(Cola) %>%
summarise(
Order = sum(Order),
LocA_Quantity = sum(Quantity * if_else(Loc == "LocA", 1, 0)),
LocB_Quantity = sum(Quantity * if_else(Loc == "LocB", 1, 0)),
Quantity = sum(Quantity)
)
You can do it for both Quantity and order and drop columns you dont want at the end, i.e.
library(tidyverse)
df %>%
group_by(Cola) %>%
mutate_at(vars(2:3), list(new = sum)) %>%
pivot_wider(names_from = Loc, values_from = 2:3)
## A tibble: 3 x 7
## Groups: Cola [3]
# Cola Order_new Quantity_new Order_LocA Order_LocB Quantity_LocA Quantity_LocB
# <fct> <int> <int> <int> <int> <int> <int>
#1 ABC 2 8 1 1 4 4
#2 CSD 4 6 4 NA 6 NA
#3 CDS 3 2 NA 3 NA 2
1) dplyr/tidyr Using the data shown reproducibly in the Note at the end, sum the orders and quantity and create a Quantity_ column equal to Quantity by Cola. Then reshape the Quantity_ column to wide form.
library(dplyr)
library(tidyr)
df %>%
group_by(Cola) %>%
mutate(Quantity_ = Quantity,
Order = sum(Order),
Quantity = sum(Quantity)) %>%
ungroup %>%
pivot_wider(names_from = "Loc", values_from = "Quantity_",
names_prefix = "Quantity_", values_fill = list(Quantity_ = 0))
giving:
# A tibble: 3 x 5
Cola Order Quantity Quantity_LocA Quantity_LocB
<chr> <int> <int> <int> <int>
1 ABC 2 8 4 4
2 CSD 4 6 6 0
3 CDS 3 2 0 2
2) Base R We can do much the same in base R using transform/ave and reshape like this:
df2 <- transform(df,
Quantity_ = Quantity,
Quantity = ave(Quantity, Cola, FUN = sum),
Order = ave(Order, Cola, FUN = sum))
wide <- reshape(df2, dir = "wide", idvar = c("Cola", "Quantity", "Order"),
timevar = "Loc", sep = "")
wide
## Cola Order Quantity Quantity_LocA Quantity_LocB
## 1 ABC 2 8 4 4
## 3 CSD 4 6 6 NA
## 4 CDS 3 2 NA 2
Note
Lines <- "Cola Order Quantity Loc
ABC 1 4 LocA
ABC 1 4 LocB
CSD 4 6 LocA
CDS 3 2 LocB"
df <- read.table(text = Lines, header = TRUE, as.is = TRUE)

Different output between sum and +

I'm working on a problem that consists basically on sum all the rows based on their ID and sum some specific variables to get a consolidated dataset to input on another work, but there is an issue with the sum function and I'd appreciate some explanation about this.
Dataset:
teste <- data.frame(ID = c(1, 1, 2, 1, 3, 3, 2),
VALUE = c(10, 10, 10, 10, 10, 10, 10),
MOD = c(1, 1, 1, 1, 1, 1, 1))
ID VALUE MOD
1 1 10 1
2 1 10 1
3 2 10 1
4 1 10 1
5 3 10 1
6 3 10 1
7 2 10 1
Using + operator:
teste %>%
group_by(ID) %>%
summarise_all(sum, na.rm = TRUE) %>%
mutate(CONS = VALUE + MOD)
# A tibble: 3 x 4
ID VALUE MOD CONS
<dbl> <dbl> <dbl> <dbl>
1 1 30 3 33
2 2 20 2 22
3 3 20 2 22
Using sum function:
teste %>%
group_by(ID) %>%
summarise_all(sum, na.rm = TRUE) %>%
mutate(CONS = sum(VALUE, MOD))
# A tibble: 3 x 4
ID VALUE MOD CONS
<dbl> <dbl> <dbl> <dbl>
1 1 30 3 77
2 2 20 2 77
3 3 20 2 77
summarize_all removes one level of grouping so re-group it:
teste %>%
group_by(ID) %>%
summarise_all(sum, na.rm = TRUE) %>%
group_by(ID) %>% # <--------------------------
mutate(CONS = sum(VALUE, MOD)) %>%
ungroup
giving:
# A tibble: 3 x 4
# Groups: ID [3]
ID VALUE MOD CONS
<dbl> <dbl> <dbl> <dbl>
1 1 30 3 33
2 2 20 2 22
3 3 20 2 22

Resources