create a dataframe for multiple line plot for ggplot R - r

This question is about arranging data for a ggplot line plot. I have been doing this manually with Excel and I want to work out a way to do this using R.
I have reviewed this post which is similar
Arrange dataframe format for ggplot - R
I have a dataset that looks like this:
[image: screenshot of the dataset]
I want to convert it to a dataframe that is divided into the groups (N,A,G) and into age brackets and the proportion per age_group.
An example of what I am trying to achieve:
Appreciate your help.
Data:
structure(list(ID = 1:10, Age = c(9L, 16L, 12L, 13L, 29L, 24L,
23L, 24L, 16L, 40L), Sex = structure(c(1L, 1L, 2L, 1L, 1L, 2L,
2L, 1L, 1L, 1L), .Label = c("F", "M"), class = "factor"), Age_group =
c(1L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 4L), N = c(1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L), A = c(0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L,
0L), G = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L)), class = "data.frame",
row.names = c(NA,
-10L))

We can pivot to 'long' format with pivot_longer and then create a grouping variable with cut on the 'Age' and get the sum of 'n' and 'proportion'
library(dplyr)
library(tidyr)
# `df1` is the question's data frame (the `structure(...)` output shown above).
# Reshape the indicator columns N, A and G to long form, bin `Age` into
# brackets, total the indicators per bracket and group, then express each total
# as a share of its bracket.
df1 %>%
  pivot_longer(cols = N:G, names_to = "group", values_to = "n") %>%
  group_by(
    Age_group_new = cut(
      Age,
      breaks = c(-Inf, 0, seq(10, 70, by = 10), 100, Inf)
    ),
    group
  ) %>%
  summarise(n = sum(n)) %>%
  group_by(Age_group_new) %>%
  mutate(
    proportion = n / sum(n),
    # A bracket whose indicators are all 0 gives 0 / 0 = NaN; report it as 0.
    proportion = replace(proportion, is.nan(proportion), 0)
  ) %>%
  # Hand back an ungrouped result so later verbs do not silently run
  # per age bracket.
  ungroup()

Related

Add a new row on basis of column values in R

I am trying to get my head around this simple preprocessing task in R. I am trying to get the ideal value column as a row titled ideal in Product ID. I think the image below will shed more light on it.
> dput(df)
structure(list(Consumer = c(43L, 43L, 43L, 43L, 43L, 41L, 41L,
41L, 41L, 41L), Product = c(106L, 992L, 366L, 257L, 548L, 106L,
992L, 366L, 257L, 548L), Firm = c(1L, 1L, 1L, 1L, 1L, 0L, 0L,
0L, 0L, 0L), Juicy = c(1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L
), Sweet = c(0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L), Ideal_Firm = c(1L,
1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), Ideal_Juicy = c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Ideal_Sweet = c(1L, 1L, 1L,
1L, 1L, 0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-10L))
Below is a solution:
# Example data: two consumers rating the same five products. The values are
# drawn at random, so they change between runs unless a seed is set first.
df <- data.frame(
  Consumer = c(rep(43, 5), rep(41, 5)),
  Product = rep(sample(100:900, size = 5, replace = FALSE), 2),
  Firm = c(sample(rep(0:1, 5), replace = TRUE)),
  Juicy = c(sample(rep(0:1, 5), replace = TRUE)),
  Sweet = c(sample(rep(0:1, 5), replace = TRUE)),
  Ideal_Firm = 1,
  Ideal_Juicy = c(rep(1, 5), rep(2, 5)),
  Ideal_Sweet = c(rep(1, 5), rep(0, 5))
)
library(dplyr)
# pivot_wider(), pivot_longer() and separate() come from tidyr, not dplyr.
library(tidyr)
df <- merge(
  # Bind the observation...
  # (one row per consumer, one column per characteristic/product pair,
  # e.g. Firm_106)
  df %>%
    select(Consumer:Sweet) %>%
    pivot_wider(
      id_cols = Consumer,
      names_from = Product,
      values_from = Firm:Sweet
    ),
  # ... to the ideal
  df %>%
    group_by(Consumer) %>%
    # Here I put mean, but it could be 1, median, min, max... If I understood
    # correctly, it has to be 1?
    summarise(across(Ideal_Firm:Ideal_Sweet, ~ mean(.x))) %>%
    # Rename so the column name has the form [characteristic]_Ideal instead of
    # Ideal_[characteristic]: drop the prefix and append the suffix in one
    # step. rename_with() replaces the deprecated rename_at()/funs() pair.
    rename_with(
      ~ paste0(sub("Ideal_", "", .x), "_Ideal"),
      .cols = starts_with("Ideal_")
    )
)
# Then manipulate to get into long form again
df <- df %>%
  pivot_longer(cols = !Consumer) %>%
  # Column names look like Firm_106 or Firm_Ideal; split them at the
  # underscore.
  separate(name, c("Characteristic", "Product")) %>%
  pivot_wider(
    # Name the id columns explicitly; the range Consumer:Product would also
    # span the Characteristic column that is being spread.
    id_cols = c(Consumer, Product),
    names_from = Characteristic,
    values_from = value
  )
df

How to mutate a column using dplyr with a value when any of the columns contain a 1 otherwise 0

events <- structure(list(ID = c(3049951, 3085397, 3204081, 3262134,
3467254), TVTProcedureStartDate = structure(c(16210, 16238, 16322,
16420, 16546), class = "Date"), DCDate = structure(c(16213, 16250,
16326, 16426, 16560), class = "Date"), CE_EventOccurred = c(0L,
0L, 0L, 0L, 0L), CE_EventDate = c(0L, 0L, 0L, 0L, 0L), `Annular Dissection (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Aortic Dissection (In Hospital)` = c(0L, 0L,
0L, 1L, 0L), `Atrial Fibrillation (In Hospital)` = c(0L, 1L,
0L, 0L, 1L), `Bleeding at Access Site (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Cardiac Arrest (In Hospital)` = c(1L, 0L, 0L,
0L, 0L), `Conduction/Native Pacer Disturbance Req ICD (In Hospital)` = c(0L,
0L, 1L, 0L, 0L), `Conduction/Native Pacer Disturbance Req Pacer (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Endocarditis (In Hospital)` = c(0L, 0L, 0L,
0L, 0L), `GI Bleed (In Hospital)` = c(0L, 0L, 0L, 0L, 0L), `Hematoma at Access Site (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Ischemic Stroke (In Hospital)` = c(0L, 0L,
0L, 0L, 0L), `Major Vascular Complications (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Minor Vascular Complication (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Mitral Leaflet Injury - detected during surgery (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Mitral Subvalvular Injury -detected during surgery (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `New Requirement for Dialysis (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Other Bleed (In Hospital)` = c(0L, 0L, 0L,
0L, 0L), `Perforation with or w/o Tamponade (In Hospital)` = c(1L,
0L, 0L, 0L, 0L), `Retroperitoneal Bleeding (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Single Leaflet Device Attachment (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Unplanned Other Cardiac Surgery or Intervention (In Hospital)` = c(0L,
0L, 0L, 0L, 0L), `Unplanned Vascular Surgery or Intervention (In Hospital)` = c(0L,
0L, 0L, 1L, 0L)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -5L), vars = "NCDRPatientID", labels = structure(list(
NCDRPatientID = c(3049951, 3085397, 3204081, 3262134, 3467254
)), class = "data.frame", row.names = c(NA, -5L), vars = "NCDRPatientID", labels = structure(list(
NCDRPatientID = c(3049951, 3085397, 3204081, 3262134, 3467254,
3467324, 3510387, 3586037, 3661089, 3668621, 3679485, 3737916,
3738064, 3960141, 4006862, 4018241, 4019056, 4025174, 4027490,
4050900, 4051101, 4096816, 4097119, 4097146, 4097180, 4098426,
4106410, 4109968, 4147466, 4198427, 4198450, 4198458, 4204554,
4208053, 4213116, 4218802, 4218854, 4223378, 4223415, 4243959,
4316979, 4341660, 4348676, 4413567, 4419513, 4421948, 4422768,
4426483, 4430159, 4431211, 4433156, 4433406, 4433988)), class = "data.frame", row.names = c(NA,
-53L), vars = "NCDRPatientID", labels = structure(list(NCDRPatientID = c(3049951,
3085397, 3204081, 3262134, 3467254, 3467324, 3510387, 3586037,
3661089, 3668621, 3679485, 3737916, 3738064, 3960141, 4006862,
4018241, 4019056, 4025174, 4027490, 4050900, 4051101, 4096816,
4097119, 4097146, 4097180, 4098426, 4106410, 4109968, 4147466,
4198427, 4198450, 4198458, 4204554, 4208053, 4213116, 4218802,
4218854, 4223378, 4223415, 4243959, 4316979, 4341660, 4348676,
4413567, 4419513, 4421948, 4422768, 4426483, 4430159, 4431211,
4433156, 4433406, 4433988)), class = "data.frame", row.names = c(NA,
-53L), vars = "NCDRPatientID", drop = TRUE), indices = list(0L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10:12, 13L, 14L, 15L,
16:17, 18L, 19:21, 22L, 23L, 24L, 25:26, 27L, 28L, 29:30,
31L, 32:33, 34L, 35:38, 39L, 40:41, 42L, 43L, 44L, 45L, 46L,
47L, 48:50, 51:53, 54L, 55L, 56L, 57L, 58L, 59:60, 61L, 62L,
63:64, 65:66, 67:68, 69L, 70L, 71:72, 73L), drop = TRUE, group_sizes = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 2L, 1L, 3L,
1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 4L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L,
1L, 1L, 2L, 1L), biggest_group_size = 4L), indices = list(0L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L,
39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L,
51L, 52L), drop = TRUE, group_sizes = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), biggest_group_size = 1L), indices = list(0L, 1L, 2L, 3L, 4L), drop = TRUE, group_sizes = c(1L,
1L, 1L, 1L, 1L), biggest_group_size = 1L)
From this data, I need to create a column that has value 1 if any of the columns which ends in (in-hospital) contains 1 else 0.
I tried multiple things, but each attempt either does not work or throws an error:
Error in mutate_impl(.data, dots) : Evaluation error: NA/NaN argument.
event %>% mutate(TR = rowSums(select_(.,6:n)))
Error in mutate_impl(.data, dots) : Column `TR` must be length 1 (the group size), not 53
event %>% mutate(TR = rowSums(.[6:ncol(.)]))
And some other variations of it to see if I can understand or make some sense, but it keeps running into the similar errors and problems
Another thing i tried was the following which seems to do the row sums, but it also adds the ID even when I'm doing the following:
event %>% select(6:27) %>% rowSums()
but it added the ID with the 1s and 0s from columns 6 to 27 for each row. Not sure why it's doing this.
I want the results as a data frame with the same data, but also a column with 1s if any of the columns from 6 to 27 contains 1 otherwise 0
Before I developed my solution, I ran the following code to ungroup your data.
library(dplyr)
events <- events %>% ungroup()
Solution 1: rowSums with selected columns
The idea of this solution is to use rowSums to add all the numbers from the selected columns, determine if the sum is larger than 0, and then convert the logical vector to an integer vector (with 1 or 0).
There are many ways to select the columns. We can select based on column numbers.
events2 <- events %>% mutate(Col = as.integer(rowSums(select(., 6:27)) > 0))
events2$Col
# [1] 1 1 1 1 1
We can use ends_with.
events2 <- events %>% mutate(Col = as.integer(rowSums(select(., ends_with("(In Hospital)"))) > 0))
events2$Col
# [1] 1 1 1 1 1
We can use matches. The regular expression \\(In Hospital\\)$ indicates the string at the end.
events2 <- events %>% mutate(Col = as.integer(rowSums(select(., matches("\\(In Hospital\\)$"))) > 0))
events2$Col
# [1] 1 1 1 1 1
We can use contains, but notice that the target string does not need to be at the end of the column names.
events2 <- events %>% mutate(Col = as.integer(rowSums(select(., contains("(In Hospital)"))) > 0))
events2$Col
# [1] 1 1 1 1 1
Solution 2: apply with max
Since the numbers from the target columns are all 1 or 0, we can use apply with max to get the row maximum, which will be 1 if there are any 1s and 0 otherwise. All the ways to use the select function shown above will also work here. Below I present one way to do this.
events2 <- events %>% mutate(Col = apply(select(., ends_with("(In Hospital)")), 1, max))
events2$Col
# [1] 1 1 1 1 1
It is not a dplyr way, but it also works:
events$new_col <- 0
events$new_col[rowSums(events[, grep("In Hospital", colnames(events))]) >= 1] <- 1
A solution from base R using apply()
# Positions of the columns whose name contains "in hospital", in any letter
# case.
cols <- grep("in hospital", colnames(events), ignore.case = TRUE)
# For each row, yield 1 when any selected column equals 1 and 0 otherwise.
# any() already returns a single logical, so it is converted directly rather
# than passed through the vectorised ifelse().
apply(events[, cols], 1, function(x) as.numeric(any(x == 1)))
# [1] 1 1 1 1 1

Passing the list of strings as input to a function

I am trying automate a simple task in R using a function.
`c` is a character vector of variable names and `mydata` is the dataset.
Basically, I need to give each of the strings in vector C as an input to the function.
dataset:
mydata <- structure(list(a = c(1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L), b = c(4L,3L, 1L, 2L, 1L, 5L, 2L, 2L), c = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,1L), d = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), t = c(42L, 34L, 74L,39L, 47L, 8L, 36L, 39L), s = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L)), .Names = c("a", "b", "c", "d", "t", "s"), row.names = c(NA,8L), class = "data.frame")
code:
c<-c("a","b","c","d")
plot<-function()
for (i in c)
{
fit<-survfit(Surv(s,t)~paste(i), dat=mydata)
ggsurvplot(fit, pval = TRUE)
}
plot()
I am facing the following error:
Error in model.frame.default(formula = Surv(mydata$s, mydata$t) ~
paste(i), : variable lengths differ (found for 'paste(i)')
I have tried the reformulate as well:
plot<-function()
for (i in c)
{
survfit(update(Surv(s,t)~., reformulate(i)), data=mydata)
ggsurvplot(fit, pval = TRUE)
}
plot()
but this code also gives this error:
Error in reformulate(i) : object 'i' not found
Any help to make this code work?
Thanks
Building formulas dynamically can be tricky. Rather than
fit(Surv(mydata$s,mydata$t)~paste(i), dat=mydata)
use
fit(update(Surv(s,t)~., reformulate(i)), data=mydata)
You should avoid using $ with formulas. Here reformulate() helps to build a formula from a string and update() combines parts of formulas. See the help pages for these functions if you would like more details.
Here's the full working version with the sample input
#sample input
mydata <- structure(list(a = c(1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L), b = c(4L,3L, 1L, 2L, 1L, 5L, 2L, 2L), c = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,1L), d = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), t = c(42L, 34L, 74L,39L, 47L, 8L, 36L, 39L), s = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L)), .Names = c("a", "b", "c", "d", "t", "s"), row.names = c(NA,8L), class = "data.frame")
c<-c("a","b","c","d")
and the code
library(survival)
library(survminer)
# Draw one survival-curve plot per covariate named in the global character
# vector `c`, fitting each model on the global data frame `mydata`.
# Takes no arguments; called for its side effect of drawing the plots.
plot <- function() {
  for (i in c) {
    # reformulate(i) builds `~ <column name>` from the string; update() merges
    # that right-hand side into the Surv(t, s) response.
    fit <- survfit(update(Surv(t, s) ~ ., reformulate(i)), data = mydata)
    # Values are not auto-printed inside a loop or a function body, so the
    # plot object has to be printed explicitly to be drawn. The data is passed
    # explicitly rather than left for ggsurvplot() to recover from `fit`.
    print(ggsurvplot(fit, data = mydata))
  }
}
plot()
When I copy/paste that into R I do not get any errors. You must be doing something different than the sample code you've posted.

Subset using 'IF' and 'BY' in R

For a sample dataframe:
df <- structure(list(id = 1:19, region.1 = structure(c(1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 5L, 5L, 5L
), .Label = c("AT1", "AT2", "AT3", "AT4", "AT5"), class = "factor"),
PoorHealth = c(0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L)), .Names = c("id", "region.1",
"PoorHealth"), class = "data.frame", row.names = c(NA, -19L))
I want to subset using the BY command, and hoped somebody may be able to help me.
I want to INCLUDE regions (regions.1) in df that satisfy this condition:
Less than (or equal to) 3 occurrences of '1' in the variable 'PoorHealth'
OR this condition:
Where N (i.e. the respondents in each region) is less than or equal to 6.
If anyone has any ideas to help me, I should be very grateful.
This should work. I don't know if there is a cleaner way:
library(data.table)
setDT(df)
# Evaluate the inclusion rule once per region: at most three respondents with
# PoorHealth == 1, or at most six respondents in total. Then keep the names of
# the regions where the rule holds.
qualified_regions <- df[,
  .(keep = sum(PoorHealth == 1) <= 3 | .N <= 6),
  by = region.1
][(keep), region.1]
# Keep every row that belongs to a qualifying region.
df[region.1 %in% qualified_regions]
E: I removed the !-mark because OP changed "EXCLUDE" to "INCLUDE" in the original question.

calculate the sum per 2 columns

I have the following data frame:
all <- structure(list(counts = c(0L, 0L, 3L, 0L, 2L, 0L), counts = c(0L,
2L, 1L, 0L, 5L, 1L), counts = c(1L, 9L, 17L, 0L, 7L, 2L), counts = c(2L,
1L, 13L, 0L, 7L, 5L), counts = c(1L, 1L, 3L, 0L, 2L, 10L), counts = c(0L,
2L, 2L, 0L, 8L, 9L), counts = c(0L, 4L, 4L, 0L, 4L, 0L), counts = c(0L,
2L, 3L, 0L, 7L, 1L), counts = c(0L, 2L, 0L, 0L, 3L, 8L), counts = c(1L,
3L, 3L, 0L, 4L, 13L), counts = c(0L, 6L, 12L, 0L, 3L, 2L), counts = c(0L,
7L, 6L, 0L, 4L, 2L), counts = c(1L, 0L, 1L, 0L, 2L, 5L), counts = c(1L,
1L, 2L, 0L, 3L, 6L), counts = c(0L, 2L, 1L, 1L, 2L, 0L), counts = c(0L,
4L, 1L, 0L, 4L, 0L), counts = c(0L, 2L, 1L, 0L, 3L, 3L), counts = c(0L,
1L, 1L, 0L, 2L, 1L), counts = c(0L, 3L, 1L, 0L, 5L, 0L), counts = c(0L,
4L, 5L, 0L, 1L, 0L), counts = c(0L, 2L, 5L, 0L, 8L, 23L), counts = c(0L,
0L, 2L, 0L, 1L, 7L), counts = c(1L, 0L, 0L, 0L, 1L, 2L), counts = c(0L,
0L, 0L, 0L, 1L, 0L)), .Names = c("counts", "counts", "counts",
"counts", "counts", "counts", "counts", "counts", "counts", "counts",
"counts", "counts", "counts", "counts", "counts", "counts", "counts",
"counts", "counts", "counts", "counts", "counts", "counts", "counts"
), row.names = c("1/2-SBSRNA4", "A1BG", "A1BG-AS1", "A1CF", "A2LD1",
"A2M"), class = "data.frame")
In this data frame I need the sum of every 2 columns. In the simplest form this can be done with all[1] + all[2], all[3] + all[4], etc., and at the end I could cbind the new frames again, but I know this can be done with something like aggregate or apply. I have not managed to get that to work yet. My best try so far is: allfinal <-aggregate( all ,FUN = sum,by=[1:2] ) I know this is not how it should work, but I can't figure out how to correctly use aggregate or (s)apply to do this. Any tips are appreciated!
As output I want a data frame that holds the sum of each pair of columns in a single column. The data.frame now has 24 columns, so at the end I need 12 columns.
you can try this:
# Transpose so the original columns become rows, label consecutive pairs of
# rows 1, 1, 2, 2, ..., sum the rows within each label, and transpose back so
# that every output column is the sum of two adjacent input columns.
t(
  rowsum(
    t(all),
    rep(seq_len(ncol(all) / 2), each = 2)
  )
)
hth

Resources