How to use mutate function while excluding certain rows - r

So I am trying to create percentages to display on a plot.
Here is my dataset:
how_often_ByYear <- structure(list(Var1 = structure(c(1L, 2L, 3L, 1L, 2L, 3L), levels = c("A few times a year",
"Never been", "Once or twice"), class = "factor"), Var2 = structure(c(1L,
1L, 1L, 2L, 2L, 2L), levels = c("Year 1", "Year 2"), class = "factor"),
Freq = c(0, 122, 47, 1, 117, 50), percent = c(0, 72, 28,
1, 69, 30)), class = "data.frame", row.names = c(NA, -6L))
And here is my code:
how_often_ByYear <- Visitor_Data_ByYear %>%
dplyr::select(How_often_have_you_visited_us, Year) #selects column for question 16
#mutate_all(funs(gsub("[[:punct:]]", "", .))) #removes annoying symbols
how_often_ByYear <- table(how_often_ByYear$How_often_have_you_visited_us, how_often_ByYear$Year)
how_often_ByYear <- as.data.frame(how_often_ByYear)
how_often_ByYear <- how_often_ByYear %>%
mutate(percent = Freq/sum(Freq)*100) %>%
mutate_if(is.numeric, round, 0)
View(how_often_ByYear)
Right now, the percentages are computed over Year 1 and Year 2 combined, so the percentages within each year add up to only around 50 percent. How do I calculate the percentages separately for each year so that I can report on both?
Thanks in advance for your help.

Related

R forestplot() function fix section widths

I want to generate multiple forest plots using the forestplot() function in R (from the package R/forestplot), and want to ensure that I can line up the text and graph sections in each so they can be usefully shown as a stacked plot, like this:
(taken from R forestplot package blank lines with section headings)
but with the possibility that there may be different scales in each subplot, lining up the zero effect line in each plot.
Which attributes need changing in the forestplot() call to ensure that this occurs?
EDIT: to provide a minimum code example
library(forestplot)
library(tidyr)
cohort <- data.frame(Age = c(43, 39, 34, 55, 70, 59, 44, 83, 76, 44,
75, 60, 62, 50, 44, 40, 41, 42, 37, 35, 55, 46),
Status = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 2L), levels = c("-", "+"), class = "factor"),
Group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), levels = c("1","2"), class = "factor"))
age.lm <- lm(Age ~ Group, data = cohort)
status.lm <- glm(Status ~ Group, data = cohort, family=binomial(link=logit))
age.data <- summary(age.lm)$coefficients[2,]
status.data <- summary(status.lm)$coefficients[2,]
# Build the forest-plot rows for each model: a reference row for Group 1
# (effect 0, p = 1) and a Group 2 row holding the coefficient estimate with a
# Wald-type 95% interval (estimate +/- z * standard error) and its p-value.
# The two-sided 95% normal quantile is ~1.96 (the original used 1.95).
# Mixing numbers and labels in c() coerces each row to character; the numeric
# columns are converted back with as.numeric() further down.
z <- qnorm(0.975)
age.data <- rbind(
  c(0, 0, 0, 1, "Group 1", "n=15"),
  c(age.data[1], age.data[1] - age.data[2] * z, age.data[1] + age.data[2] * z,
    age.data[4], "Group 2", "n=7")
)
status.data <- rbind(
  c(0, 0, 0, 1, "Group 1", "[+13,-2]"),
  c(status.data[1], status.data[1] - status.data[2] * z,
    status.data[1] + status.data[2] * z,
    status.data[4], "Group 2", "[+2,-5]")
)
colnames(age.data) <- c("mean","lower","upper","p-val","labeltext","numbers")
colnames(status.data) <- c("mean","lower","upper","p-val","labeltext","numbers")
age.data <- data.frame(age.data)
status.data <- data.frame(status.data)
age.data$mean <- as.numeric(age.data$mean)
age.data$lower <- as.numeric(age.data$lower)
age.data$upper <- as.numeric(age.data$upper)
status.data$mean <- exp(as.numeric(status.data$mean))
status.data$lower <- exp(as.numeric(status.data$lower))
status.data$upper <- exp(as.numeric(status.data$upper))
age.plot <- forestplot(age.data,
labeltext = c(labeltext,numbers),
boxsize = 0.1,
xlog = FALSE,
clip=c(-20,20),
xticks=c(-20,-10,0,10,20),
txt_gp = fpTxtGp(ticks=gpar(cex=1)),
align=c("l","c","l"))
status.plot <- forestplot(status.data,
labeltext = c(labeltext,numbers),
boxsize = 0.1,
xlog = TRUE,
clip=c(1/100,100),
xticks=c(log(1e-2),log(1e-1),0,log(1e1),log(1e2)),
txt_gp = fpTxtGp(ticks=gpar(cex=1)),
align=c("l","c","l"))
Note that the age plot is a linear model and the status plot is a logistic model:
I want to be able to arrange the relative sizes of the text to the left and the plot to the right in order that the zero-effect lines (at 0 and at 1 respectively) line up so that the forest plots stack cleanly.
With the align argument you can left-align ("l") each part of your plot, so the text columns are left-aligned. To line up the zero-effect lines, you can use the mar argument and adjust its units for one of the graphs. Here is a reproducible example:
library(forestplot)
library(tidyr)
age.plot <- forestplot(age.data,
labeltext = c(labeltext,numbers),
boxsize = 0.1,
xlog = FALSE,
clip=c(-20,20),
xticks=c(-20,-10,0,10,20),
txt_gp = fpTxtGp(ticks=gpar(cex=1)),
align=c("l","l","l")
)
status.plot <- forestplot(status.data,
labeltext = c(labeltext,numbers),
boxsize = 0.1,
xlog = TRUE,
clip=c(1/100,100),
xticks=c(log(1e-2),log(1e-1),0,log(1e1),log(1e2)),
txt_gp = fpTxtGp(ticks=gpar(cex=1)),
align=c("l","l","l"),
mar = unit(c(0,5,0,10.5), "mm")
)
library(grid)
grid.newpage()
pushViewport(viewport(layout = grid.layout(2, 1)))
pushViewport(viewport(layout.pos.row = 1))
plot(age.plot)
popViewport()
pushViewport(viewport(layout.pos.row = 2))
plot(status.plot)
popViewport(2)
Created on 2022-12-28 with reprex v2.0.2

How to order a plot from lowest to highest?

I looked at other threads where this question is answered, but I couldn't adapt the code. I plot the graph below. I then try to order it from lowest to highest according to the blue color (education == 3) when time is 0. I use the following code to create the order.
country_order <- df %>%
filter(education == 3 & time==0) %>%
arrange(unemployment) %>%
ungroup() %>%
mutate(order = row_number())
However, I am not sure how to introduce the new variable order into ggplot to get the ordering I want. Could someone help?
Here is the plot
ggplot(df, aes(y=unemployment, x=time, fill= education)) +
geom_col(, color = "black") +
facet_wrap(~ country)
Here is the data:
df= structure(list(time = structure(c(2L, 2L, 2L, 1L, 1L, 1L, 2L,
2L, 2L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
unemployment = structure(c(25, 35, 40, 10, 20, 70, 20, 25,
55, 23, 17, 60), format.stata = "%9.0g"), education = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1",
"2", "3"), class = "factor"), country = structure(c(1, 1,
1, 1, 1, 1, 2, 2, 2, 2, 2, 2), format.stata = "%9.0g")), row.names = c(NA,
-12L), class = "data.frame")
I think you can use fct_reorder() to reorder factor levels of the desired variable by sorting along another variable.
# Reorder the levels of `education` by `unemployment` (descending) so the
# fill groups are stacked in that order within each country facet.
df %>%
  ggplot(aes(
    y = unemployment,
    x = time,
    fill = fct_reorder(education, unemployment, .desc = TRUE)
  )) +
  geom_col(color = "black") +
  facet_wrap(~ country)

Ggplot stacked bar plot with percentage labels

I am trying to do a stacked bar plot based on count, but with the labels showing the percentage on the plot. I have produced the plot below. However the percentage is based on all of the data. What I am after is the percentage by team (such that the sum of the percentages for Australia = 100% and the percentages for England = 100%).
The code for achieving this is the following function. This function counts the number of different roles in each team across 5 matches (I have had to divide the result by 10, as a player's role appears twice for each match: 5 matches x 2 appearances):
team_roles_Q51 <- function(){
ashes_df <- tidy_data()
graph <- ggplot(ashes_df %>%
count(team, role) %>% #Groups by team and role
mutate(pct=n/sum(n)), #Calculates % for each role
aes(team, n, fill=role)) +
geom_bar(stat="identity") +
scale_y_continuous(labels=function(x)x/10) + #Needs to be a better way than dividing by 10
ylab("Number of Participants") +
geom_text(aes(label=paste0(sprintf("%1.1f", pct*100),"%")),
position=position_stack(vjust=0.5)) +
ggtitle("England & Australia Team Make Up") +
theme_bw()
print(graph)
}
An example of the dataframe that is imported is:
Structure for the first 10 rows of the dataframe as follows:
structure(list(batter = c("Ali", "Anderson", "Bairstow", "Ball",
"Bancroft", "Bird", "Broad", "Cook", "Crane", "Cummins"), team = structure(c(2L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L), .Label = c("Australia",
"England"), class = "factor"), role = structure(c(1L, 3L, 4L,
3L, 2L, 3L, 3L, 2L, 3L, 3L), .Label = c("allrounder", "batsman",
"bowler", "wicketkeeper"), class = "factor"), innings = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("test_1_innings_1",
"test_1_innings_2", "test_2_innings_1", "test_2_innings_2", "test_3_innings_1",
"test_3_innings_2", "test_4_innings_1", "test_4_innings_2", "test_5_innings_1",
"test_5_innings_2"), class = "factor"), batting_num = c(6, 11,
7, 10, 1, NA, 9, 1, NA, 9), score = c(38, 5, 9, 14, 5, NA, 20,
2, NA, 42), balls_faced = c(102, 9, 24, 11, 19, NA, 32, 10, NA,
120)), row.names = c(NA, 10L), class = "data.frame")
Any help would be appreciated. Thanks
You need to group_by team to calculate the proportion and use pct in aes :
library(dplyr)
library(ggplot2)
# Count players per team/role, then convert the counts to within-team
# percentages so that the segments of each team's bar sum to 100%.
ashes_df %>%
  count(team, role) %>%
  group_by(team) %>%
  mutate(pct = prop.table(n) * 100) %>%
  ungroup() %>% # drop the grouping before handing the data to ggplot
  ggplot(aes(team, pct, fill = role)) +
  geom_col() +
  # the y axis now shows percentages, not counts
  ylab("Percentage of Participants") +
  geom_text(
    aes(label = sprintf("%1.1f%%", pct)),
    position = position_stack(vjust = 0.5)
  ) +
  ggtitle("England & Australia Team Make Up") +
  theme_bw()

connect points within position_dodged factor x-axis in ggplot2

I'm trying to add significance annotations to an errorbar plot with a factor x-axis and dodged groups within each level of the x-axis. It is a similar but NOT identical use case to this
My base errorbar plot is:
library(ggplot2)
library(dplyr)
pres_prob_pd = structure(list(x = structure(c(1, 1, 1, 2, 2, 2, 3, 3, 3), labels = c(`1` = 1,
`2` = 2, `3` = 3)), predicted = c(0.571584427222816, 0.712630712634987,
0.156061969566517, 0.0162388386564817, 0.0371877245103279, 0.0165022541901018,
0.131528946944238, 0.35927812866896, 0.0708662221985375), std.error = c(0.355802875027348,
0.471253661425626, 0.457109887762665, 0.352871728451576, 0.442646879181155,
0.425913568532558, 0.376552208691762, 0.48178172708116, 0.451758041335245
), conf.low = c(0.399141779923204, 0.496138837620712, 0.0701919316506831,
0.00819832576725402, 0.0159620304815404, 0.00722904089045731,
0.0675129352870401, 0.17905347369819, 0.030504893442457), conf.high = c(0.728233665534388,
0.861980236164486, 0.311759350126477, 0.031911364587827, 0.0842227723261319,
0.0372248587668487, 0.240584344249407, 0.590437963881823, 0.156035177669385
), group = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("certain",
"neutral", "uncertain"), class = "factor"), group_col = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("certain", "neutral",
"uncertain"), class = "factor"), language = structure(c(2L, 2L,
2L, 1L, 1L, 1L, 3L, 3L, 3L), .Label = c("english", "dutch", "german"
), class = "factor"), top = c(0.861980236164486, 0.861980236164486,
0.861980236164486, 0.0842227723261319, 0.0842227723261319, 0.0842227723261319,
0.590437963881823, 0.590437963881823, 0.590437963881823)), row.names = c(NA,
-9L), groups = structure(list(language = structure(1:3, .Label = c("english",
"dutch", "german"), class = "factor"), .rows = structure(list(
4:6, 1:3, 7:9), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 3L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
#dodge
pd = position_dodge(.75)
#plot
p = ggplot(pres_prob_pd,aes(x=language,y=predicted,color=group,shape=group)) +
geom_point(position=pd,size=2) +
geom_errorbar(aes(ymax=conf.high,ymin=conf.low),width=.125,position=pd)
p
What I want to do is annotate the plot such that the contrasts between group within each level of language are annotated for significance. I've plotted points representing the relevant contrasts and (toy) sig. annotations as follows:
# Bump function: raise each value of `x` by a growing offset so that
# successive annotations are stacked above one another instead of overlapping.
# The k-th element (1-based) is raised by 0.025 + 0.075 * (k - 1).
# Returns an unnamed numeric vector the same length as `x`; empty input gives
# numeric(0) rather than NULL. Vectorised, so no vector is grown in a loop.
f <- function(x) {
  first_bump <- 0.025
  bump_step <- 0.075
  as.numeric(x) + first_bump + bump_step * (seq_along(x) - 1)
}
#create contrasts
combs = data.frame(gtools::combinations(3, 2, v=c("certain", "neutral", "uncertain"), set=F, repeats.allowed=F)) %>%
mutate(contrast=c("cont_1","cont_2","cont_3"))
combs = rbind(combs %>% mutate(language = 'english'),
combs %>% mutate(language='dutch'),
combs %>% mutate(language = "german")) %>%
left_join(select(pres_prob_pd,language:top)%>%distinct(),by='language') %>%
group_by(language)
#long transform and calc y_pos
combs_long = mutate(combs,y_pos=f(top)) %>% gather(long, probability, X1:X2, factor_key=TRUE) %>% mutate(language=factor(language,levels=c("english","dutch","german"))) %>%
arrange(language,contrast)
#back to wide
combs_wide =combs_long %>% spread(long,probability)
combs_wide$p = rep(c('***',"*","ns"),3)
#plot
p +
geom_point(data=combs_long,
aes(x = language,
color=probability,
shape=probability,
y=y_pos),
inherit.aes = T,
position=pd,
size=2) +
geom_text(data=combs_wide,
aes(x=language,
label=p,
y=y_pos+.025,
group=X1),
color='black',
position=position_dodge(.75),
inherit.aes = F)
What I am failing to achieve is plotting a line connecting each of the contrasts of group within each level of language, as is standard when annotating significant group-wise differences. Any help much appreciated!

How do I write a function to manipulate several dataframes the same way?

Complete novice. I do not know how to write a function. I have several dataframes that all need to be manipulated in the same way and the output should be dataframes with the same names. I have functioning code that can manipulate a single dataframe. I would like to be able to manipulate several at once.
Here are 2 example df's:
ex1 <- structure(list(info1 = c("Day", "2018.04.03 10:47:33", "2018.04.03 11:20:04", "2018.04.03 11:35:04"), info2 = c("Status_0", "Ok", "Ok", "Ok"
), X = c(200L, 1L, 2L, 3L), X.1 = c(202.5, 1, 2, 3), X.2 = c(205L,
1L, 2L, 3L), X.3 = c(207.5, 1, 2, 3), X.4 = c(210L, 1L, 2L, 3L
), X.5 = c(212.5, 1, 2, 3), X.6 = c(215L, 1L, 2L, 3L)), class = "data.frame", row.names = c(NA, -4L))
ex2 <- structure(list(info1 = c("Day", "2018.04.10 12:47:33", "2018.04.10 13:20:04", "2018.04.10 13:35:04"), info2 = c("Status_0", "Ok", "Ok", "Ok"
), X = c(200L, 1L, 2L, 3L), X.1 = c(202.5, 1, 2, 3), X.2 = c(205L,
1L, 2L, 3L), X.3 = c(207.5, 1, 2, 3), X.4 = c(210L, 1L, 2L, 3L
), X.5 = c(212.5, 1, 2, 3), X.6 = c(215L, 1L, 2L, 3L)), class = "data.frame", row.names = c(NA, -4L))
Here is the functioning code to manipulate 'ex1'
library(tidyverse)
library(lubridate)
colnames(ex1) <- ex1[1,]
ex1 <- ex1 %>%
slice(-1) %>%
rename(Date.Time = "Date/Time") %>%
mutate(timestamp = parse_date_time(Date.Time, "%Y.%m.%d %H:%M:%S")) %>%
select(timestamp, Date.Time, everything()) %>% select(-Date.Time) %>%
select(-c(Status_0:"202.5", "212.5":"215"))
colnames(ex1)[-1] <- paste("raw", colnames(ex1)[-1], sep = "_")
Secondary question: let's say I wanted to change the function so that it accepted a df, but also a type (i.e., raw or comp), and the function call would be tidydatafunc(df, type). If I input type = "comp", it would change the last line of the code where I have "raw" to "comp". How could I change the function to accommodate this?
Any help is greatly appreciated. I'm sure this is basic stuff for most of you!
Wrap your script in a function and specify its parameters.
# Tidy one raw data frame whose first row holds the real column headers.
#
# Arguments:
#   df   - a data frame laid out like the examples (header values in row 1).
#   type - character prefix pasted onto every column name except the first
#          (e.g. "raw" or "comp").
# Returns the transformed data frame: header row promoted and dropped, a
# parsed `timestamp` column placed first, the unwanted column ranges removed,
# and the remaining columns renamed to "<type>_<name>".
my_fun <- function(df, type = "comp") {
  # basic input validation is extremely useful
  stopifnot(is.data.frame(df))
  stopifnot(is.character(type))
  # Promote the first row to column names (one string per column).
  colnames(df) <- vapply(df[1, ], as.character, character(1), USE.NAMES = FALSE)
  # Assign the pipeline result back to `df`; storing it under another name
  # would discard the whole transformation, because `df` is what is returned.
  # NOTE(review): the rename expects a "Date/Time" header, as in the
  # question's own code - confirm it matches the real data's first row.
  df <- df %>%
    slice(-1) %>%
    rename(Date.Time = "Date/Time") %>%
    mutate(timestamp = parse_date_time(Date.Time, "%Y.%m.%d %H:%M:%S")) %>%
    select(timestamp, Date.Time, everything()) %>%
    select(-Date.Time) %>%
    select(-c(Status_0:"202.5", "212.5":"215"))
  # pass the character type
  colnames(df)[-1] <- paste(type, colnames(df)[-1], sep = "_")
  df
}
Then you can use it.
my_fun(ex1, "comp") # view
new_ex1 <- my_fun(ex1, "comp") # save to variable new_ex1

Resources