Using Geom_Segment in R - r

I am trying to create a plot in R using Geom_Segment. I am stuck with an error that says I need to input yend but I am inputting it already... this is my code:
library(ggplot2)
library(data.table)
library(magrittr)
# Overwrite the From column with today's date (Sys.Date() carries no time of day).
dataset$From<-Sys.Date()
# The formatted string is not assigned to anything, so dataset$From is unchanged.
format(dataset$From, format="%Y-%m-%dT%H:%M:%OS")
# Same two steps for the To column.
dataset$To<-Sys.Date()
format(dataset$To, format="%Y-%m-%dT%H:%M:%OS")
# Timeline plot: one thick horizontal segment per row (y and yend are the same
# column), coloured by employee.
# NOTE(review): `datetime_start` is not created anywhere in this snippet, and the
# aesthetics use `dataset$col` instead of bare column names -- confirm both
# against the real data.
ggplot(dataset, aes(x=datetime_start, y=dataset$Audit_Title,
color=dataset$Employee_Name)) +
geom_segment(aes(x=dataset$From,xend=dataset$To,y=dataset$Audit_Title,yend=dataset$Audit_Title),size=20)+
# Draw the legend keys at size 15 instead of the segment size of 20.
scale_colour_discrete(guide=guide_legend(override.aes=list(size=15))) +
ggtitle("Audit by Employee Timeline") + xlab("") + ylab("") + theme_bw()
SAMPLE DATA:
Here is the sample data
This is how I changed the code below to take in the data from Excel I inputted into Power BI:
library(ggplot2)
library(dplyr)
# transform into date
# Re-parse From and To from month/day/year text ("%m/%d/%Y") into Date values.
# NOTE(review): vars() is handed `dataset$From` / `dataset$To` (column values)
# rather than the bare column names From / To -- verify that this selects the
# intended columns.
dataset <- dataset %>%
mutate_at(vars(dataset$From, dataset$To),
.funs = function(tt) readr::parse_date(as.character(tt),
format = "%m/%d/%Y"))
# One segment per row running from From to To at the employee's y position.
ggplot(dataset)+
geom_segment(aes(x=dataset$From, xend=dataset$To,
y=dataset$Employee_Name, yend=dataset$Employee_Name))

First of all, ideally you would share your data as a dput(dataset). If you can't share real data, you should make a minimal reproducible example and share that. See here
Here's your data
library(ggplot2)
library(dplyr)
# Rebuild the two-row sample: audit start date, end date, audit title, employee.
df <- read.table(
  text = c(
    "01/03/2020 03/16/2020 Supply_Chain John_Smith",
    "05/08/2020 08/20/2020 Business_Unit Karen_Scott"
  )
)
names(df) <- c("From", "To", "Audit_Title", "Employee_Name")

# Transform into date: parse the month/day/year text in From and To.
# Base as.Date() is used so this step needs no extra package; as.character()
# keeps it working when the columns were read in as factors.
df[c("From", "To")] <- lapply(
  df[c("From", "To")],
  function(tt) as.Date(as.character(tt), format = "%m/%d/%Y")
)
Now draw the plot: map x and xend to the From and To dates, and map both y and yend to the employee so that each segment is horizontal (y = yend).
# Each row becomes a horizontal bar: it spans From..To on the x axis and sits
# on the employee's row, because y and yend are mapped to the same column.
ggplot(
  data = df,
  mapping = aes(x = From, xend = To, y = Employee_Name, yend = Employee_Name)
) +
  geom_segment()
Which produces
If you want fancy colors, labels and stuff go ahead and check the proper documentation for ggplot. See here

Related

Conditional formatting with color coding in R

Hi I am trying to do something similar to Conditional formatting with color as in excel using R. I have a dataset with 0,1,2,3 as their values.
# Opportunity identifiers, one per row of the table.
Opp <- c(
  10968788, 11046809, 11086342, 11097787, 11126732,
  11145638, 11163014, 11163034, 11165910, 11167232
)

# Scores (0-3) for each opportunity in categories A-E.
A <- c(1, 2, 3, 2, 3, 2, 2, 2, 2, 2)
B <- c(1, 2, 3, 3, 3, 2, 2, 2, 2, 2)
C <- c(1, 2, 3, 3, 3, 2, 2, 2, 2, 0)
D <- c(2, 2, 3, 3, 3, 2, 3, 3, 3, 3)
E <- c(2, 2, 3, 3, 3, 3, 3, 0, 3, 0)

# Assemble the wide table: one row per opportunity, one column per category.
df <- data.frame(Opp = Opp, A = A, B = B, C = C, D = D, E = E)
Input
Expected Output
I tried to get the output using a heatmap but was not successful. Please help
library(tidyverse)
# Reshape to long form (one row per opportunity/category pair), turn every
# column into a factor so the fill scale is discrete, then draw one tile per pair.
df %>%
  pivot_longer(cols = -Opp) %>%
  mutate(across(everything(), factor)) %>%
  ggplot(mapping = aes(x = name, y = Opp, fill = value)) +
  geom_tile()

Problem with the order of the x-axis in my plot

I have a problem with the order of the x-axis in my plot.
Download the data from the website of the Central Bank of Perú (GDP by quarter).
library(jsonlite)
library(rstudioapi)
library(ggplot2)
library(data.table)
# Series code to request, and the API URL for that series over the
# 2018-1-1 .. 2021-7-31 window.
PBI <- "PN02635BQ"
URL3 <- sprintf(
  "https://estadisticas.bcrp.gob.pe/estadisticas/series/api/%s/json/2018-1-1/2021-7-31",
  PBI
)
Use jsonlite to download the data:
# Fetch the series from URL3 and parse the JSON response.
l_json <- jsonlite::fromJSON(URL3)
# Build a data.table from the `periods` element of the response.
dt_PBI <- data.table(l_json$periods)
# Print the class of every column (inspection only; the result is not stored).
sapply(dt_PBI,class)
# Convert `values` to numeric rounded to 4 decimals, updating the column in place.
dt_PBI[,values := round(as.numeric(values),4)]
# Rename the columns; this assumes the table has exactly two -- TODO confirm.
colnames(dt_PBI)<- c("Quarter", "Millions")
As you see the plot is not in order in the x-axis.
# Scatter of the quarterly values, with Quarter used as-is on the x axis.
ggplot(
  data = dt_PBI,
  mapping = aes(x = Quarter, y = Millions)
) +
  geom_point()
The x-axis is not in order because your character column becomes a factor and is sorted by alphanumeric ordering. So in your case "T1.19" would sort before "T2.18". To fix this, use library forcats and sort the factor based on the year. I split this out into its own column ("year") to make it clear what was being accomplished here.
library(jsonlite)
library(rstudioapi)
library(ggplot2)
library(data.table)
library(forcats)
# Series code and API URL for the 2018-1-1 .. 2021-7-31 window.
PBI <- "PN02635BQ"
URL3 <- paste0(
  "https://estadisticas.bcrp.gob.pe/estadisticas/series/api/",
  PBI, "/json/2018-1-1/2021-7-31"
)

# Download the series and keep the `periods` element as a data.table.
l_json <- jsonlite::fromJSON(URL3)
dt_PBI <- data.table(l_json$periods)
sapply(dt_PBI, class) # inspection only: prints the column classes
dt_PBI[, values := round(as.numeric(values), 4)]
setnames(dt_PBI, c("Quarter", "Millions"))

# Split the two-digit year out of the label into its own column.
# NOTE(review): assumes labels look like "T1.18" (quarter digit at position 2,
# year at positions 4-5) -- confirm against the API output.
dt_PBI[, year := substr(Quarter, 4, 5)]

# Order the factor levels chronologically. The key combines year AND quarter as
# numbers, so the order within a year is explicit instead of depending on how
# ties between equal years happen to be broken.
dt_PBI[
  ,
  Quarter := fct_reorder(
    Quarter,
    as.integer(year) * 10L + as.integer(substr(Quarter, 2, 2)),
    min
  )
]

ggplot(dt_PBI, aes(x = Quarter, y = Millions)) +
  geom_point()

refining a simple code in R, it all works I want to polish it

here is my data
https://filebin.net/i2wpmeb19dacs3nr
it's very simple calculation to do but my code is messy.
#plot
library(ggplot2)
library(Hmisc)
library(svglite)
# Basic dot plot of plant height per genotype, saved to an SVG file.
# The summary statistics need complete rows, so rows containing any NA are
# dropped first (they could otherwise be ignored in the statistic itself).
leaf_count_jas_comlete<-leaf_count_jas[complete.cases(leaf_count_jas), ]
# Convert every column to character (this also turns factors into plain text).
leaf_count_jas_comlete <- data.frame(lapply(leaf_count_jas_comlete, as.character), stringsAsFactors=FALSE)
# Convert the two measurement columns back to numeric.
leaf_count_jas_comlete$leaf.no.<- as.numeric(leaf_count_jas_comlete$leaf.no.)
leaf_count_jas_comlete$height..cm.<- as.numeric(leaf_count_jas_comlete$height..cm.)
# Open the SVG device; everything drawn until dev.off() goes into this file.
svg("rplot_height..cm..svg")
# Dots are binned along y and stacked symmetrically around each genotype.
ggplot(leaf_count_jas_comlete, aes(x=genotype, y=height..cm.)) +
geom_dotplot(binaxis='y', stackdir='center', binwidth = 1, dotsize = 0.3)+
# Group mean drawn as a blue point.
# NOTE(review): newer ggplot2 releases name this argument `fun` instead of
# `fun.y` -- confirm against the installed version.
stat_summary(fun.y=mean, geom="point", shape=18,
size=3, color="blue") +
# Mean with a range of one standard deviation either side (mean_sdl, mult = 1).
stat_summary(fun.data=mean_sdl, fun.args = list(mult=1),
geom="pointrange", color="blue")
# Close the device so the file is written.
dev.off()
# Calculate mean and sd per genotype and collect everything in one summary table.
library(plyr)
# Work on the NA-free data for the means and standard deviations.
dt <- leaf_count_jas_comlete
# Empty data frame; it is not used again in this snippet.
jas_summary<-data.frame()
# Per-genotype mean and sd of height, then of leaf number.
jas_summary_h<-ddply(dt,~genotype,summarise,mean=mean(height..cm.),sd=sd(height..cm.))
jas_summary_l<-ddply(dt,~genotype,summarise,mean=mean(leaf.no.),sd=sd(leaf.no.))
# Both tables have `mean` and `sd` columns, so merge() tells them apart with its
# default suffixes.
jas_summary_h_l<-merge(jas_summary_h,jas_summary_l, by="genotype", all=TRUE)
library(plyr)
# Number of rows per genotype in the raw data (NA rows included).
n<-count(leaf_count_jas, "genotype")
# Number of missing leaf counts per genotype; na.action = NULL keeps the NA rows
# so they can be counted.
dead<-aggregate(leaf.no. ~ genotype, leaf_count_jas, function(x) {sum(is.na(x))}, na.action = NULL)
# Append the missing-value counts and the row counts to the summary.
jas_summary_h_l_dead<-merge(jas_summary_h_l,dead, by="genotype", all=TRUE)
jas_summary_h_l_dead_n<-merge(jas_summary_h_l_dead,n, by="genotype", all=TRUE)
I would like to start coding in simpler way. For example how to make an empty df
and fill it with the summary data so it has a column called "genotype", "mean_leaf_no", "leaf_no_sd", "mean_height", "height_sd", "no_plants" and "dead_plants" and gradually fill it with the calculations instead of merging new dfs? also now when I have it as it is I need to name the columns, but I would like this piece of code to be usable for many times so I would like the name to be constructed based on the original dfs col names (genotype, leaf_no, height) so for example if someone will measure "flowers_no" so it can keep the correct name all over to the summary table.
Help, please, help.
You should consider using the tidyverse if you'd like more simple code.
For example, the first part would be:
library(tidyverse); library(magrittr); library(janitor)
# Clean the measurements in a single pipeline.
# NOTE(review): the input here is the already-cleaned `leaf_count_jas_comlete`;
# if this pipeline is meant to replace the manual cleaning, it should probably
# start from the raw `leaf_count_jas` instead -- confirm.
new_set <- leaf_count_jas_comlete %>%
  # clean names (the later steps refer to the columns as leaf_no / height_cm)
  clean_names() %>%
  # remove rows with NA
  drop_na() %>%
  # convert factors to character
  mutate(across(where(is.factor), as.character)) %>%
  # convert the measurement columns to numeric
  mutate(
    leaf_no = as.numeric(leaf_no),
    height_cm = as.numeric(height_cm)
  )

# Mean and standard deviation of height for each genotype.
new_set %>%
  group_by(genotype) %>%
  summarize(
    mean_height = mean(height_cm),
    sd_height = sd(height_cm)
  )

cumsum data over time by factor

I'm using the campaign contributions data from Oregon and I'm trying to make a graph that displays the cumulative amount of contributions per candidate over time. Here's what I have so far:
# Cumulative contribution amount against receipt date, one colour per candidate.
# NOTE(review): cumsum() is applied to the whole column here, not separately for
# each candidate -- confirm that this is what is wanted.
ggplot(aes(x = as.Date(contb_receipt_dt, "%d-%b-%y"),
y = cumsum(contb_receipt_amt)),
# Keep rows dated after 2015-01-01 whose candidate count in table() exceeds 1000.
data = subset(oregon_data,
table(oregon_data$cand_nm)[oregon_data$cand_nm] > 1000
& as.Date(contb_receipt_dt, "%d-%b-%y") > as.Date("2015-01-01")))
# NOTE(review): this line starts with `+`, after the ggplot() call has already
# closed on the previous line -- check that the layer is really attached.
+ geom_line(aes(color = cand_nm), bins = 5)
This is what it looks like:
What I would like to see is a line for each candidate that starts off at 0 and slowly goes up with each additional contribution. What should I do?
I would use dplyr to calculate the cumsum column before sending it on to ggplot. This should give you enough to get started; however, you will need to pretty it up and filter the data to get the results you are looking for:
library(dplyr)
library(ggplot2)

WashingtonData <- read.csv("P00000001-WA.csv")

# Parse the receipt date before sorting. Sorting the raw text would order rows
# by day-of-month string rather than chronologically, which scrambles the
# running total and the x axis.
# NOTE(review): assumes dates look like "%d-%b-%y" (e.g. 05-JAN-16) and that the
# session locale recognises the month abbreviations -- confirm against the file.
WashingtonData <- WashingtonData %>%
  mutate(
    contb_receipt_dt = as.Date(as.character(contb_receipt_dt), format = "%d-%b-%y")
  ) %>%
  arrange(contb_receipt_dt)

# Running total of contributions within each candidate, in date order.
MyGraphData <- WashingtonData %>%
  group_by(cand_nm) %>%
  mutate(cumsum = cumsum(contb_receipt_amt)) %>%
  ungroup()

# One line per candidate: cumulative amount over time.
g <- ggplot(
  data = MyGraphData,
  aes(x = contb_receipt_dt, y = cumsum, color = cand_nm)
) +
  geom_line()
g

Substituting dates with number of days in time series

I have following data on student scores on several pretests before their true exam.
# Column labels: six pretest dates followed by the exam-date column.
a <- c(
  "2013-02-25", "2013-03-13", "2013-04-24",
  "2013-05-12", "2013-07-12", "2013-08-11",
  "actual_exam_date"
)

# One vector per student: six pretest scores (NA = not taken), then the exam date.
b <- c(300, 230, 400, NA, NA, NA, "2013-04-30")
c <- c(NA, 260, 410, 420, NA, NA, "2013-05-30")
d <- c(300, 230, 400, NA, 370, 390, "2013-08-30")

# Stack the students as rows, then label the columns and rows.
df <- as.data.frame(rbind(b, c, d))
names(df) <- a
row.names(df) <- paste("student", seq_len(3))
The actual datasheet is much larger. Since the dates vary so much, and the timing between the pretests and to the exam are relatively similar, I would rather convert the true dates into the number of days before the exam, so that they are the new column names, not dates. I understand that this will merge some of the columns which is OK. How would I be able to do that?
This is another good use case for reshape2, because you want to go to long form for plotting. For example:
library('reshape2')
library('ggplot2')

# melt() needs the student label as an ordinary column, so copy it out of the
# row names first.
df$student_id <- row.names(df)

# Long form: one row per (student, pretest date) with the score as the value.
df2 <- melt(
  df,
  id.vars = c('student_id', 'actual_exam_date'),
  variable.name = 'pretest_date',
  value.name = 'pretest_score'
)

# Keep only the pretests that actually have a score.
df2 <- subset(df2, !is.na(pretest_score))

# Give the columns real types: dates as Date, scores as numbers.
df2 <- transform(
  df2,
  actual_exam_date = as.Date(actual_exam_date),
  pretest_date = as.Date(pretest_date),
  pretest_score = as.numeric(pretest_score)
)

# Whole days between each pretest and that student's exam.
df2$days_before_exam <- as.integer(df2$actual_exam_date - df2$pretest_date)

# One line per student; the x axis is reversed so time runs towards the exam,
# which is marked by the dashed line at 0.
ggplot(df2, aes(x = days_before_exam, y = pretest_score, col = student_id)) +
  geom_line(lwd = 1) +
  scale_x_reverse() +
  geom_vline(xintercept = 0, linetype = 'dashed', lwd = 1) +
  labs(
    title = 'Pretest Performance',
    x = 'Days Before Exam',
    y = 'Pretest Score'
  )
Here is one way to approach this one. I am sure there are many others. I commented the code to explain what is going on at each step:
# Load the libraries you need (ggplot2 is required for the plot at the end)
library(tidyr)
library(dplyr)
library(ggplot2)

# Construct data frame you provided
a <- c(
  "2013-02-25", "2013-03-13", "2013-04-24",
  "2013-05-12", "2013-07-12", "2013-08-11",
  "actual_exam_date"
)
b <- c(300, 230, 400, NA, NA, NA, "2013-04-30")
c <- c(NA, 260, 410, 420, NA, NA, "2013-05-30")
d <- c(300, 230, 400, NA, 370, 390, "2013-08-30")
df <- as.data.frame(rbind(b, c, d))
colnames(df) <- a

# Add student IDs as a column instead of row names and move them to first position
df$StudentID <- row.names(df)
row.names(df) <- NULL
df <- select(df, StudentID, everything())

# Turn the date columns into (Date, Score) pairs: one row per student and pretest
newdf <- df %>%
  pivot_longer(
    cols = -c(actual_exam_date, StudentID),
    names_to = "Date",
    values_to = "Score"
  ) %>%
  arrange(StudentID)

# Convert the text columns to real types, compute the days to the exam, and
# drop pretests without a score. Score must be numeric, otherwise ggplot would
# treat it as a discrete variable. as.character() keeps the conversions correct
# if the columns arrive as factors.
newdf <- newdf %>%
  mutate(
    actual_exam_date = as.Date(as.character(actual_exam_date)),
    Date = as.Date(as.character(Date)),
    Score = as.numeric(as.character(Score)),
    daysBeforeExam = as.integer(difftime(actual_exam_date, Date, units = "days"))
  ) %>%
  filter(!is.na(Score))

# Plot the trends using ggplot
ggplot(
  newdf,
  aes(x = daysBeforeExam, y = Score, col = StudentID, group = StudentID)
) +
  geom_line(size = 1) +
  geom_point(size = 2)

Resources