I have a data frame with long format data as follows
ID Frame.No ROI.No Flux.med
01 1 1 78
01 1 2 76
01 2 1 80
01 2 2 80
01 3 1 89
01 3 2 80
27 1 1 60
27 1 2 68
27 4 1 80
27 4 2 89
For each "ID" I want to get the first and maximum Flux.med for both ROI 1 and 2 and put all these in a new dataframe. If I have a dataframe with just one subject (e.g. ID 01) I am able to identify the Flux.med values I need using the following code:
ROI1.baseline <- mydata %>%
filter(ROI.No == "ROI 1" & Frame.No == min(Frame.No))%>%
select(Flux.Med)
ROI1.max <- mydata%>%
filter(ROI.No == "ROI 1")%>%
filter (Flux.Med == max(Flux.Med))%>%
select(Flux.Med)
ROI2.baseline <- mydata%>%
filter(ROI.No == "ROI 2" & Frame.No == min(Frame.No))%>%
select(Flux.Med)
ROI.max <- mydata%>%
filter(ROI.No == "ROI 2")%>%
filter (Flux.Med == max(Flux.Med))%>%
select(Flux.Med)
But I need to do that for each ID and save the results in a dataframe.
Can I do this with a for loop?
We can get first and max value in each ID and ROI.No.
library(dplyr)
mydata %>%
group_by(ID, ROI.No) %>%
summarise(first_flux = first(Flux.med),
max_flux = max(Flux.med))
# ID ROI.No first_flux max_flux
# <int> <int> <int> <int>
#1 1 1 78 89
#2 1 2 76 80
#3 27 1 60 80
#4 27 2 68 89
Or using aggregate :
aggregate(Flux.med~ID + ROI.No, mydata, function(x) c(first = x[1], max = max(x)))
data
mydata <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 27L, 27L, 27L,
27L), Frame.No = c(1L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 4L, 4L), ROI.No = c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), Flux.med = c(78L, 76L, 80L,
80L, 89L, 80L, 60L, 68L, 80L, 89L)), class = "data.frame", row.names = c(NA,-10L))
We can use data.table
library(data.table)
setDT(df1)[, .(first_flux = first(Flux.med),
max_flux = max(Flux.med)), .(ID, ROI.No)]
data
df1 <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 27L, 27L, 27L,
27L), Frame.No = c(1L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 4L, 4L), ROI.No = c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), Flux.med = c(78L, 76L, 80L,
80L, 89L, 80L, 60L, 68L, 80L, 89L)), class = "data.frame",
row.names = c(NA,-10L))
Thanks for the suggestions. So this is how I did it in the end:
ROI1.baseline <- ldi_data %>%
group_by(ID)%>%
filter(ROI.No == "ROI 1" & Frame.No == min(Frame.No))%>% ###uses lowest number frame as baseline (not necessarily frame 1 if it was excluded)
select(Flux.Med)%>%
dplyr::rename(ROI1_baseline = Flux.Med)%>%
as.data.frame(ROI1.baseline)
ROI1.max <- ldi_data%>%
group_by(ID)%>%
filter(ROI.No == "ROI 1")%>%
filter (Flux.Med == max(Flux.Med))%>%
select(Flux.Med)%>%
dplyr::rename(ROI1_max = Flux.Med)%>%
as.data.frame(ROI1.max)
ROI2.baseline <- ldi_data%>%
group_by(ID)%>%
filter(ROI.No == "ROI 2" & Frame.No == min(Frame.No))%>%
select(Flux.Med)%>%
dplyr::rename(ROI2_baseline = Flux.Med)%>%
as.data.frame(ROI2.baseline)
ROI2.max <- ldi_data%>%
group_by(ID)%>%
filter(ROI.No == "ROI 2")%>%
filter (Flux.Med == max(Flux.Med))%>%
select(Flux.Med)%>%
dplyr::rename(ROI2_max = Flux.Med)%>%
as.data.frame(ROI2.max)
summary <- Reduce(merge, list(ROI1.baseline, ROI1.max, ROI2.baseline, ROI2.max))
Related
[![enter image description here][2]][2][![i need help in writing gstummary r code to produce following table output.dummy table shown in above table][2]][2]
i need help in writing gstummary r code to produce following table output.dummy table shown in above table
[![enter image description here][2]][2]
library(gtsummary)
[![enter image description here][2]][2]
[![enter image description here][3]][3]
id
age
sex
country
edu
ln
ivds
n2
p5
1
a
M
eng
x
45
15
40
15
2
a
M
eng
x
23
26
70
15
4
a
M
eng
x
26
36
35
40
5
b
F
eng
x
26
25
36
47
6
b
F
wal
y
45
45
60
12
7
b
M
wal
y
60
25
36
15
8
c
M
wal
y
70
08
25
36
9
c
F
sco
z
80
25
36
15
10
c
F
sco
z
90
25
26
39
structure(list(id = 1:15, age = structure(c(1L, 1L, 2L, 1L, 2L,
2L, 2L, 3L, 3L, 3L, 1L, 1L, 2L, 1L, 2L), .Label = c("a", "b",
"c"), class = "factor"), sex = structure(c(2L, 1L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("F", "M"), class = "factor"),
country = structure(c(1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 3L), .Label = c("eng", "scot", "wale"
), class = "factor"), edu = structure(c(1L, 1L, 1L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L), .Label = c("x",
"y", "z"), class = "factor"), lon = c(45L, 23L,
25L, 45L, 70L, 69L, 90L, 50L, 62L, 45L, 23L, 25L, 45L, 70L,
69L), is = c(15L, 26L, 36L, 34L, 2L, 4L, 5L, 8L, 9L,
15L, 26L, 36L, 34L, 2L, 4L), n2 = c(40L, 70L, 50L, 60L,
30L, 25L, 80L, 89L, 10L, 40L, 70L, 50L, 60L, 30L, 25L), p5 = c(15L,
20L, 36L, 48L, 25L, 36L, 28L, 15L, 25L, 15L, 20L, 36L, 48L,
25L, 36L)), row.names = c(NA, 15L), class = "data.frame")
[
I made a table similar to what you have above (more similar to the table you had before you updated it). But I think it'll get you most of the way there.
The type of table you're requesting it something that is in the works. In the meantime, you will need to use the bstfun::tbl_2way_summary() function. This function exists in another package while we work to make it better before integrating with gtsummary.
library(bstfun) # install with `remotes::install_github("ddsjoberg/bstfun")`
library(gtsummary)
packageVersion("gtsummary")
#> [1] '1.4.1'
# add a column that is all the same value
trial2 <- trial %>% mutate(constant = TRUE)
# loop over each continuous variable, construct table, then merge them together
tbls_row1 <-
c("age", "marker", "ttdeath") %>%
purrr::map(
~tbl_2way_summary(data = trial2, row = grade, col = constant, con = all_of(.x),
statistic = "{mean} ({sd}) - {min}, {max}") %>%
modify_header(stat_1 = paste0("**", .x, "**"))
) %>%
tbl_merge() %>%
modify_spanning_header(everything() ~ NA)
# repeat for the second row
tbls_row2 <-
c("age", "marker", "ttdeath") %>%
purrr::map(
~tbl_2way_summary(data = trial2, row = stage, col = constant, con = all_of(.x),
statistic = "{mean} ({sd}) - {min}, {max}") %>%
modify_header(stat_1 = paste0("**", .x, "**"))
) %>%
tbl_merge() %>%
modify_spanning_header(everything() ~ NA)
# stack these tables
tbl_stacked <- tbl_stack(list(tbls_row1, tbls_row2))
# lastly, add calculated summary stats for categorical variables, and merge them
tbl_summary_stats <-
trial2 %>%
tbl_summary(
include = c(grade, stage),
missing = "no"
) %>%
modify_header(stat_0 ~ "**n (%)**") %>%
modify_footnote(everything() ~ NA)
tbl_final <-
tbl_merge(list(tbl_summary_stats, tbl_stacked)) %>%
modify_spanning_header(everything() ~ NA) %>%
# column spanning column headers
modify_spanning_header(
list(c(stat_1_1_2, stat_1_2_2) ~ "**Group 1**",
stat_1_3_2 ~ "**Group 2**")
)
Created on 2021-07-10 by the reprex package (v2.0.0)
I have a data.frame as shown below:
Basic requirement is to find average of "n" number of "value" after certain date per group.
For ex:, user provides:
Certain Date = Failure Date
n = 4
Hence, for A, the average would be (60+70+80+100)/4 ; ignoring NAs
and for B, the average would be (80+90+100)/3. Note for B, n=4 does not happen as there are only 3 values after the satisfied condition failuredate = valuedate.
Here is the dput:
structure(list(Name = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A",
"B"), class = "factor"), FailureDate = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("1/5/2020", "1/7/2020"), class = "factor"), ValueDate = structure(c(1L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 2L, 1L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 2L), .Label = c("1/1/2020", "1/10/2020", "1/2/2020",
"1/3/2020", "1/4/2020", "1/5/2020", "1/6/2020", "1/7/2020", "1/8/2020",
"1/9/2020"), class = "factor"), Value = c(10L, 20L, 30L, 40L,
NA, 60L, 70L, 80L, NA, 100L, 10L, 20L, 30L, 40L, 50L, 60L, 70L,
80L, 90L, 100L)), class = "data.frame", row.names = c(NA, -20L
))
We could create an index with cumsum after grouping by 'Name', extract the 'Value' elements and get the mean
library(dplyr)
n <- 4
df1 %>%
type.convert(as.is = TRUE) %>%
group_by(Name) %>%
summarise(Ave = mean(head(na.omit(Value[lag(cumsum(FailureDate == ValueDate),
default = 0) > 0]), n), na.rm = TRUE))
# A tibble: 2 x 2
# Name Ave
# <chr> <dbl>
#1 A 77.5
#2 B 90
You can convert factor dates to the Date object and then compute averages of "n" numbers after FailureDate per group. Note that "n" numbers should exclude NA, so tidyr::drop_na() is used here.
library(dplyr)
df %>%
mutate(across(contains("Date"), as.Date, "%m/%d/%Y")) %>%
tidyr::drop_na(Value) %>%
group_by(Name) %>%
summarise(mean = mean(Value[ValueDate > FailureDate][1:4], na.rm = T))
# # A tibble: 2 x 2
# Name mean
# <fct> <dbl>
# 1 A 77.5
# 2 B 90
You can try this:
library(dplyr)
n <- 4
df %>%
mutate(condition = as.character(FailureDate) == as.character(ValueDate))
group_by(Name) %>%
mutate(condition = cumsum(condition)) %>%
filter(condition == 1) %>%
slice(-1) %>%
filter(!is.na(Value)) %>%
slice(1:n) %>%
summarise(mean_col = mean(Value))
> df
# A tibble: 2 x 2
Name mean_col
<fct> <dbl>
1 A 77.5
2 B 90
How can i replace the missing values for each group separately?
The reproducible example:
mydata=structure(list(group1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), group.2 = c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L), x1 = c(20L, 4L, 91L, NA, 94L, 69L, 38L,
NA, 29L, 69L, 55L, 86L, 81L, 11L, NA, 12L, 65L, 90L, 74L, NA,
49L, 90L), x2 = c(44L, 94L, NA, 1L, 67L, NA, 73L, 22L, 44L, 24L,
NA, 54L, 70L, 65L, 97L, 10L, 97L, NA, 74L, 97L, 34L, 29L)), class = "data.frame", row.names = c(NA,
-22L))
Now i found how to replace the missing values without groups.
library(dplyr)
mydata %>% mutate_at(vars(starts_with("x1")), funs(ifelse(is.na(.) & is.numeric(.) ,mean(., na.rm = TRUE),.)))
But i need to replace for each groups (group1,group2) separately.
edit to small dataset
structure(list(group1 = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L), group.2 = c(1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
2L, 2L, 2L), x1 = c(63L, 67L, 57L, NA, 65L, 75L, 57L, 80L, 42L,
NA, 35L, 80L), x2 = c(46L, 1L, NA, 41L, 80L, NA, 74L, 73L, NA,
13L, 83L, NA)), class = "data.frame", row.names = c(NA, -12L))
mydata=structure(list(group1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), group2 = c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L), x1 = c(20L, 4L, 91L, NA, 94L, 69L, 38L,
NA, 29L, 69L, 55L, 86L, 81L, 11L, NA, 12L, 65L, 90L, 74L, NA,
49L, 90L), x2 = c(44L, 94L, NA, 1L, 67L, NA, 73L, 22L, 44L, 24L,
NA, 54L, 70L, 65L, 97L, 10L, 97L, NA, 74L, 97L, 34L, 29L)), class = "data.frame", row.names = c(NA,
-22L))
library(tidyverse)
mydata %>%
unite(group, group1, group2) %>% # combine groups
mutate(id = row_number()) %>% # add the row number as an id (useful when reshaping)
gather(var, value, -group, -id) %>% # reshape data
group_by(group, var) %>% # for each group combination and variable
mutate(value = ifelse(is.na(value), mean(value, na.rm = T), value)) %>% # replace NAs with mean
spread(var, value) %>% # reshape again
arrange(id) %>% # keep order of original dataset
select(-id) %>% # remove id
ungroup() %>% # forget the grouping
separate(group, c("group1","group2")) # split the groups again
# # A tibble: 22 x 4
# group1 group2 x1 x2
# <chr> <chr> <dbl> <dbl>
# 1 1 1 20 44
# 2 1 2 4 94
# 3 1 1 91 61.3
# 4 1 2 36.5 1
# 5 1 1 94 67
# 6 1 2 69 39
# 7 1 1 38 73
# 8 1 2 36.5 22
# 9 2 1 29 44
# 10 2 2 69 24
# # ... with 12 more rows
I would like to create a new column Trial based on the ending of a character string. For example, the characters that end with the number 2 like A3-H2 or A9-H2 will be considered Trial 2 and those not ending with a number like A3-H or A9-H will be considered Trial 1. This should be an easy ifelse statement but I don't know how to do it based on the end of a character string.
It would go from this:
Plant Trtmt
1: SC A3-H
2: SC A3-H2
3: SC A9-H
4: SC A9-H2
To this:
Plant Trtmt Trial
1: SC A3-H 1
2: SC A3-H2 2
3: SC A9-H 1
4: SC A9-H2 2
Real Data:
dput(stack.df)
structure(list(Plant = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("SC",
"W"), class = "factor"), Trtmt = c("A3-H", "A3-H", "A3-H", "A3-H",
"A3-H", "A9-H", "A9-H", "A9-H", "A9-H", "A9-H", "A3-H2", "A3-H2",
"A3-H2", "A3-H2", "A3-H2", "A9-H2", "A9-H2", "A9-H2", "A9-H2",
"A9-H2")), .Names = c("Plant", "Trtmt"), row.names = c(6L, 7L,
8L, 9L, 10L, 16L, 17L, 18L, 19L, 20L, 66L, 67L, 68L, 69L, 70L,
76L, 77L, 78L, 79L, 80L), class = "data.frame")
library(tidyverse)
stack.df <- stack.df %>%
mutate(Trial = ifelse(grepl("2$", Trtmt), 2, 1))
library(dplyr)
library(stringr)
stack.df %>% mutate(Trial = ifelse(str_sub(Trtmt,-1)=="2", 2, 1))
Here is one option with str_extract
library(stringr)
library(data.table)
setDT(stack.df)[, Trial := pmax(as.numeric(str_extract(Trtmt, "\\d+$")), 1, na.rm = TRUE)]
You can get the last character with substr(stack.df$Trtmt,nchar(stack.df$Trtmt)-1,nchar(stack.df$Trtmt))
Like you said, it's an easy ifelse from there :-)
This is a follow up question to the following problem give here
I have the following data
Data:
df = structure(list(Org_ID = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L),
Market_volume = c(100L, 200L, 300L, 50L, 500L, 400L, 200L,
300L, 100L), Indicator_variable = c(1L, 0L, 0L, 1L, 1L, 0L,
0L, 0L, 0L),variable3=c(10L, 1L, 1L, 4L, 2L, 3L, 3L, 10L, 3L),variable4=c(2L, 1L, 1L, 7L, 2L, 3L, 3L, 8L, 3L)).Names = c("Org_ID", "Market_volume", "Indicator_variable","Var3","Var4"
), class = "data.frame", row.names = c(NA, -9L))
Using (dplyr), i calculated the % of NA's by market volume by Org_ID via the following function
df %>%
group_by(Org_ID) %>%
summarize(sum_market_vol = sum(Market_volume*!Indicator_variable),
tot_market_vol = sum(Market_volume)) %>%
transmute(Org_ID, Perc_Market_Vol = 100*sum_market_vol/tot_market_vol)
Result:
# A tibble: 3 x 2
Org_ID Perc_Market_Vol
<int> <dbl>
1 1 83.33333
2 2 0.00000
3 3 100.00000
Question:
I want to subset my original data by deleting all rows of Org_ID (say 2) # X if perc_market_vol<30. That is i do not want to delete individual rows of the same org_id, but Org_id as a whole, say all counts of Org_id =1 or org_id = 2. How can i subset it linking two tables or functions?
I want the new data look like this:
df1 = structure(list(Org_ID = c(1L, 1L, 1L, 3L, 3L, 3L, 3L),
Market_volume = c(100L, 200L, 300L, 400L, 200L,
300L, 100L), Indicator_variable = c(1L, 0L, 0L, 0L,
0L, 0L, 0L),variable3=c(10L, 1L, 1L, 3L, 3L, 10L, 3L),variable4=c(2L, 1L, 1L, 3L, 3L, 8L, 3L)).Names = c("Org_ID", "Market_volume", "Indicator_variable","Var3","Var4"
), class = "data.frame", row.names = c(NA, -7L))
You can filter without materializing the aggregated data frame by using group_by %>% filter, and in the filter you can calculate the aggregated condition per group:
df %>%
group_by(Org_ID) %>%
filter(sum(Market_volume * !Indicator_variable)/sum(Market_volume) > 0.3)
# A tibble: 7 x 5
# Groups: Org_ID [2]
# Org_ID Market_volume Indicator_variable Var3 Var4
# <int> <int> <int> <int> <int>
#1 1 100 1 10 2
#2 1 200 0 1 1
#3 1 300 0 1 1
#4 3 400 0 3 3
#5 3 200 0 3 3
#6 3 300 0 10 8
#7 3 100 0 3 3