Problem with the order of the x-axis in my plot - r

I have a problem with the order of the x-axis in my plot.
I download the data from the website of the Central Bank of Perú (GDP by quarter).
library(jsonlite)
library(rstudioapi)
library(ggplot2)
library(data.table)
# Series code for the quarterly GDP series.
PBI <- "PN02635BQ"
# Request URL: JSON for that series between 2018-1-1 and 2021-7-31.
URL3 <- sprintf(
  "https://estadisticas.bcrp.gob.pe/estadisticas/series/api/%s/json/2018-1-1/2021-7-31",
  PBI
)
I use jsonlite to download the data:
# Fetch the series and keep its `periods` element as a data.table.
l_json <- jsonlite::fromJSON(txt = URL3)
dt_PBI <- data.table(l_json$periods)
# Show the class of every column.
sapply(dt_PBI, class)
# Turn the values into numbers rounded to 4 decimals, then rename both columns.
dt_PBI[, values := round(as.numeric(values), digits = 4)]
colnames(dt_PBI) <- c("Quarter", "Millions")
As you can see, the x-axis of the plot is not in chronological order.
# Scatter plot of Millions against the Quarter labels.
ggplot(data = dt_PBI, mapping = aes(x = Quarter, y = Millions)) +
  geom_point()

The x-axis is not in order because your character column becomes a factor and is sorted by alphanumeric ordering. So in your case "T1.19" would sort before "T2.18". To fix this, use library forcats and sort the factor based on the year. I split this out into its own column ("year") to make it clear what was being accomplished here.
library(jsonlite)
library(rstudioapi)
library(ggplot2)
library(data.table)
library(forcats)
# Series code and request URL for the quarterly GDP series.
PBI <- "PN02635BQ"
URL3 <- paste0(
  "https://estadisticas.bcrp.gob.pe/estadisticas/series/api/",
  PBI, "/json/2018-1-1/2021-7-31"
)
l_json <- jsonlite::fromJSON(URL3)
dt_PBI <- data.table(l_json$periods)
sapply(dt_PBI, class)
dt_PBI[, values := round(as.numeric(values), 4)]
# Rename by reference; `colnames<-` on a data.table makes a copy first.
setnames(dt_PBI, c("Quarter", "Millions"))
# Quarter labels look like "T1.18": characters 4-5 hold the two-digit year.
# Store it as an integer so the sort key is compared numerically, not as text.
dt_PBI[, year := as.integer(substr(Quarter, 4, 5))]
# Reorder the Quarter levels by year; quarters within a year keep their
# existing (alphabetical, i.e. T1 < T2 < T3 < T4) relative order.
dt_PBI[, Quarter := fct_reorder(Quarter, year, min)]
ggplot(dt_PBI, aes(x = Quarter, y = Millions)) +
  geom_point()

Related

Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous

I am Asheesh from India. I am new to programming in data science. While completing a project, I encountered the error "Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous" while running the following code.
ggplot(cyclistic_data) +
  geom_bar(mapping = aes(x = member_casual, y = ride_length))
I am also sharing all of my code chunks in case you need them for reference.
# Install and load the packages this script uses.
install.packages("tidyverse")
install.packages("lubridate")
install.packages("dplyr")
install.packages("ggplot2")
library(tidyverse)
library(lubridate)
library(dplyr)
library(ggplot2)
library(readr)
# The CSV files below are read by relative name from this folder.
setwd("C:\\Users\\sashe\\Desktop\\Data Analytics\\8_Capstone\\New folder")
# Read the twelve monthly trip files (202106 through 202205).
data_2021_6 <- read.csv("202106-divvy-tripdata.csv")
data_2021_7 <- read.csv("202107-divvy-tripdata.csv")
data_2021_8 <- read.csv("202108-divvy-tripdata.csv")
data_2021_9 <- read.csv("202109-divvy-tripdata.csv")
data_2021_10 <- read.csv("202110-divvy-tripdata.csv")
data_2021_11 <- read.csv("202111-divvy-tripdata.csv")
data_2021_12 <- read.csv("202112-divvy-tripdata.csv")
data_2022_1 <- read.csv("202201-divvy-tripdata.csv")
data_2022_2 <- read.csv("202202-divvy-tripdata.csv")
data_2022_3 <- read.csv("202203-divvy-tripdata.csv")
data_2022_4 <- read.csv("202204-divvy-tripdata.csv")
data_2022_5 <- read.csv("202205-divvy-tripdata.csv")
# Stack the monthly data frames into a single data frame.
combined_data<- rbind(data_2021_6, data_2021_7, data_2021_8, data_2021_9, data_2021_10, data_2021_11, data_2021_12, data_2022_1, data_2022_2, data_2022_3, data_2022_4, data_2022_5)
# Ride duration (ended_at minus started_at), stored as a difftime object.
combined_data$ride_length <- difftime(combined_data$ended_at, combined_data$started_at)
# create columns for: date, day of week, month, day, year
combined_data$date <- as.Date(combined_data$started_at) # default format is yyyy-mm-dd, use start date
# NOTE(review): day_of_week is overwritten by the very next statement, so this wday() result is never used.
combined_data$day_of_week <- wday(combined_data$started_at) # calculate the day of the week
combined_data$day_of_week <- format(as.Date(combined_data$date), "%A") # create column for day of week (full weekday name)
combined_data$month <- format(as.Date(combined_data$date), "%m")# create column for month (two-digit text)
combined_data$day <- format(as.Date(combined_data$date), "%d") # create column for day (two-digit text)
combined_data$year <- format(as.Date(combined_data$date), "%Y") # create column for year (four-digit text)
# clean the data
combined_data <- na.omit(combined_data) # remove rows with NA values
combined_data <- distinct(combined_data) # remove duplicate rows
combined_data <- combined_data[!(combined_data$ride_length <=0),] # remove rows where ride_length is 0 or negative
combined_data <- combined_data %>% # remove columns not needed: ride_id, start_station_id, end_station_id, start_lat, start_lng, end_lat, end_lng
select(-c(ride_id, start_station_id, end_station_id,start_lat,start_lng,end_lat,end_lng))
cyclistic_data <- combined_data
# This is the call the question is about: y is mapped to ride_length, the difftime column created above.
ggplot(data = cyclistic_data ) + geom_bar(aes(x = member_casual, y= ride_length))
I would be grateful if you could help me with this.

Using Geom_Segment in R

I am trying to create a plot in R using Geom_Segment. I am stuck with an error that says I need to input yend but I am inputting it already... this is my code:
library(ggplot2)
library(data.table)
library(magrittr)
# NOTE(review): `dataset` is not created in this snippet; presumably it is supplied by the host tool (Power BI is mentioned further down) — confirm.
# From and To are both set to today's date, so every segment drawn from them starts and ends on the same day.
dataset$From<-Sys.Date()
# The formatted string is not assigned to anything, so dataset$From is left unchanged by this call.
format(dataset$From, format="%Y-%m-%dT%H:%M:%OS")
dataset$To<-Sys.Date()
# Same as above: the result of format() is discarded.
format(dataset$To, format="%Y-%m-%dT%H:%M:%OS")
# NOTE(review): datetime_start is not defined anywhere in this snippet; the geom_segment() layer below supplies its own x/xend/y/yend.
ggplot(dataset, aes(x=datetime_start, y=dataset$Audit_Title,
color=dataset$Employee_Name)) +
geom_segment(aes(x=dataset$From,xend=dataset$To,y=dataset$Audit_Title,yend=dataset$Audit_Title),size=20)+
scale_colour_discrete(guide=guide_legend(override.aes=list(size=15))) +
ggtitle("Audit by Employee Timeline") + xlab("") + ylab("") + theme_bw()
SAMPLE DATA:
Here is the sample data
This is how I changed the code below to take in the data from Excel I inputted into Power BI:
library(ggplot2)
library(dplyr)
# transform into date
# Parse the From and To columns as dates written month/day/year.
# NOTE(review): vars() is given dataset$From and dataset$To (column values) here, whereas the
# working version further down this page passes the bare column names From and To.
dataset <- dataset %>%
mutate_at(vars(dataset$From, dataset$To),
.funs = function(tt) readr::parse_date(as.character(tt),
format = "%m/%d/%Y"))
# One horizontal segment per row: From -> To on x, Employee_Name on y.
ggplot(dataset)+
geom_segment(aes(x=dataset$From, xend=dataset$To,
y=dataset$Employee_Name, yend=dataset$Employee_Name))
First of all, ideally you would share your data as a dput(dataset). If you can't share real data, you should make a minimal reproducible example and share that. See here
Here's your data
library(ggplot2)
library(dplyr)
# Rebuild the two sample rows. Columns are named at read time:
# From, To, Audit_Title, Employee_Name.
df <- read.table(
  text = "01/03/2020 03/16/2020 Supply_Chain John_Smith
05/08/2020 08/20/2020 Business_Unit Karen_Scott",
  col.names = c("From", "To", "Audit_Title", "Employee_Name")
)
# transform into date
# across() replaces the superseded mutate_at(); as.character() keeps this
# working when the columns were read as factors.
df <- df %>%
  mutate(across(
    c(From, To),
    function(tt) readr::parse_date(as.character(tt), format = "%m/%d/%Y")
  ))
Now do the actual plot by selecting the proper x xend and having y be the employee (y=yend).
# One horizontal segment per row: From -> To on x, the employee on y.
ggplot(
  data = df,
  mapping = aes(x = From, xend = To, y = Employee_Name, yend = Employee_Name)
) +
  geom_segment()
Which produces
If you want fancy colors, labels and stuff go ahead and check the proper documentation for ggplot. See here

How to change axis labels from numeric dates to string in ggplot?

I have this simple example of a boxplot:
# Day numbers and the age class of each observation (5 "juv", then 7 "ad").
date.numeric <- c(98, 105, 110, 120, 75, 35, 200, 167, 365, 425, 400, 398)
age.class <- rep(c("juv", "ad"), times = c(5, 7))
mytable <- data.frame(date.numeric, age.class)
# Boxplot of the day numbers per age class.
ggplot(data = mytable, mapping = aes(x = age.class, y = date.numeric)) +
  geom_boxplot()
My variable date.numeric is depicted as numbers in the plot, in which date number 1 represents date 1/1/2015 (reference date). How can I change the y-axis to show dates in format "month-year" instead of the numeric format?
try as.Date()
library(ggplot2)
date.numeric <- c(98, 105, 110, 120, 75, 35, 200, 167, 365, 425, 400, 398)
age.class <- c("juv", "juv", "juv", "juv", "juv", "ad", "ad", "ad", "ad", "ad", "ad", "ad")
mytable <- data.frame(date.numeric, age.class)
# Day number 1 is 2015-01-01, so the origin (day 0) must be the day before.
# Using 2015-01-01 itself as the origin would shift every date by one day.
mytable$date <- as.Date(date.numeric, origin = "2014-12-31")
# date_labels gives the requested "month-year" tick labels (e.g. "Apr-2015").
ggplot(mytable, aes(x = age.class, y = date)) +
  geom_boxplot() +
  scale_y_date(date_labels = "%b-%Y")
Created on 2018-07-17 by the reprex package (v0.2.0.9000).
try creating a date offset variable and add that to your y-axis.
library(ggplot2)
# Reference date: day number 1 corresponds to this date.
date.start <- as.Date("2015-01-01")
date.numeric <- c(98, 105, 110, 120, 75, 35, 200, 167, 365, 425, 400, 398)
age.class <- c("juv", "juv", "juv", "juv", "juv", "ad", "ad", "ad", "ad", "ad", "ad", "ad")
mytable <- data.frame(date.numeric, age.class)
# Subtract 1 because counting starts at 1 on the reference date, not at 0.
# date_labels makes the ticks read like "Apr 2015".
ggplot(mytable, aes(x = age.class, y = date.start + date.numeric - 1)) +
  geom_boxplot() +
  scale_y_date(date_labels = "%b %Y")
The axis would then look like Apr 2015, etc.

Combine multiple plots, generated using the "by" R function, in one figure

I have a data frame containing multiple numeric columns and one column with different factors. I'd like to produce a unique image containing the plots of the numeric columns, by factor. I tried the following:
# Three numeric columns of 40 uniform draws each
# (varB is drawn twice; the second draw is the one that is kept).
varA <- runif(n = 40)
varB <- runif(n = 40)
varB <- runif(n = 40)
varC <- runif(n = 40)
# Grouping column: four labels, ten consecutive rows each.
mainVar <- rep(c('cat', 'dof', 'mouse', 'frog'), each = 10)
plotData <- data.frame(varA, varB, varC, mainVar)
# Send every plot to asd.pdf.
pdf(file = 'asd.pdf')
par(mfrow = c(2, 2))
by(plotData, plotData$mainVar, function(grp) {
  # Each group switches the layout to one row of three panels.
  par(mfrow = c(1, 3))
  boxplot(grp$varA)
  boxplot(grp$varB)
  boxplot(grp$varC)
})
dev.off()
It produces a single PDF, but with a separate page for every factor level.
Instead, I'd like to get something like that (without the red lines):
First, both techniques shown here prefer data in a "tall" format. There are several tools that will reshape it for you, I'll use
# library(tidyr)
# Stack every column except mainVar into key/value pairs named k and v.
plotDataTall <- tidyr::gather(data = plotData, key = "k", value = "v", -mainVar)
head(plotDataTall, n = 6L)
# mainVar k v
# 1 cat varA 0.4023846
# 2 cat varA 0.3406813
# 3 cat varA 0.7990530
# 4 cat varA 0.3706167
# 5 cat varA 0.5986029
# 6 cat varA 0.1626782
Other tools include the reshape2 package or the stats function reshape, both of which are less intuitive for first-time users.
ggplot2
library(ggplot2)
# One facet per mainVar level (two rows of facets), one box per variable,
# with both axis titles removed.
ggplot(data = plotDataTall, mapping = aes(x = k, y = v)) +
  geom_boxplot() +
  facet_wrap(~ mainVar, nrow = 2) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )
Base R
Because you cannot nest uses of par(mfrow=...) (they replace, not nest), you can stick with the overarching 2x2 layout and handle the per-variable boxplots within a single boxplot call. This can be done with the wide data:
# 2x2 layout with tightened margins; one panel per mainVar group.
par(mfrow = c(2, 2), mar = c(4, 3, 3, 0.1))
ign <- by(plotData, plotData$mainVar, function(grp) {
  # The three variables share one panel, titled with the group's label.
  boxplot(grp$varA, grp$varB, grp$varC, main = grp$mainVar[1])
})
or the tall format:
# 2x2 layout with tightened margins; one panel per mainVar group.
par(mfrow = c(2, 2), mar = c(4, 3, 3, 0.1))
ign <- by(plotDataTall, plotDataTall$mainVar, function(grp) {
  # Formula interface: values v split by variable name k, titled with the group's label.
  boxplot(v ~ k, data = grp, main = grp$mainVar[1])
})
(I took the liberty of adjusting the margins, primarily for a shrunken combined plot here. Not required for production.)
As r2evans already points out, I doubt this is possible with the base plot function. Using ggplot2 (part of the tidyverse) you can get a one-page plot using:
library(tidyverse)
# Reshape to key/value pairs (var, y), then draw one facet per mainVar level.
plotData %>%
  gather(key = var, value = y, -mainVar) %>%
  ggplot(mapping = aes(x = var, y = y)) +
  geom_boxplot() +
  facet_wrap(~ mainVar)
Note that this also uses the pipe operator (dplyr) and gather (tidyr), both of which are part of the tidyverse.

Substituting dates with number of days in time series

I have following data on student scores on several pretests before their true exam.
# Column labels: six pretest dates followed by the exam-date column.
a <- c(
  "2013-02-25", "2013-03-13", "2013-04-24",
  "2013-05-12", "2013-07-12", "2013-08-11",
  "actual_exam_date"
)
# One vector per student: six pretest scores, then the actual exam date.
b <- c(300, 230, 400, NA, NA, NA, "2013-04-30")
c <- c(NA, 260, 410, 420, NA, NA, "2013-05-30")
d <- c(300, 230, 400, NA, 370, 390, "2013-08-30")
# One row per student; set row and column labels in a single step.
df <- as.data.frame(rbind(b, c, d))
dimnames(df) <- list(paste("student", 1:3), a)
The actual datasheet is much larger. Since the dates vary so much, and the timing between the pretests and to the exam are relatively similar, I would rather convert the true dates into the number of days before the exam, so that they are the new column names, not dates. I understand that this will merge some of the columns which is OK. How would I be able to do that?
This is another good use case for reshape2, because you want to go to long form for plotting. For example:
# Keep the student id as a regular column so it survives the reshape.
df$student_id <- row.names(df)
library('reshape2')
# Long form: one row per student and pretest date.
df2 <- melt(
  df,
  id.vars = c('student_id', 'actual_exam_date'),
  variable.name = 'pretest_date',
  value.name = 'pretest_score'
)
# Discard pretests that were never taken.
df2 <- subset(df2, !is.na(pretest_score))
# Both date columns must be real dates before they can be subtracted.
df2 <- transform(
  df2,
  actual_exam_date = as.Date(actual_exam_date),
  pretest_date = as.Date(pretest_date)
)
# Whole days between each pretest and the exam.
df2$days_before_exam <- as.integer(df2$actual_exam_date - df2$pretest_date)
# Scores are converted to numbers for plotting.
df2$pretest_score <- as.numeric(df2$pretest_score)
# Plot: one line per student, x axis reversed, dashed vertical line at day 0.
library('ggplot2')
ggplot(df2, aes(x = days_before_exam, y = pretest_score, colour = student_id)) +
  geom_line(lwd = 1) +
  scale_x_reverse() +
  geom_vline(xintercept = 0, linetype = 'dashed', lwd = 1) +
  labs(title = 'Pretest Performance', x = 'Days Before Exam', y = 'Pretest Score')
Here is one way to approach this one. I am sure there are many others. I commented the code to explain what is going on at each step:
# Load the libraries you need (ggplot2 is required for the plot at the end)
library(tidyr)
library(dplyr)
library(ggplot2)
# Construct data frame you provided
a <- c("2013-02-25", "2013-03-13", "2013-04-24", "2013-05-12", "2013-07-12", "2013-08-11", "actual_exam_date")
b <- c(300, 230, 400, NA, NA, NA, "2013-04-30")
c <- c(NA, 260, 410, 420, NA, NA, "2013-05-30")
d <- c(300, 230, 400, NA, 370, 390, "2013-08-30")
df <- as.data.frame(rbind(b, c, d))
colnames(df) <- a
# Add student IDs as a column instead of row names and move them to first position
df$StudentID <- row.names(df)
row.names(df) <- NULL
df <- select(df, StudentID, everything())
# Gather date columns as 'categories' with score as the new column value
newdf <- df %>%
  gather(Date, Score, -actual_exam_date, -StudentID) %>%
  arrange(StudentID)
# Each input vector mixes scores with a date string, so every column arrives as
# text (or factor). Convert the dates AND the scores explicitly; without the
# numeric conversion the y axis below would be discrete instead of continuous.
# All operations are row-wise, so no grouping is needed and newdf stays ungrouped.
newdf <- newdf %>%
  mutate(
    actual_exam_date = as.Date(as.character(actual_exam_date)),
    Date = as.Date(as.character(Date)),
    Score = as.numeric(as.character(Score)),
    # Whole days between the pretest and the actual exam
    daysBeforeExam = as.integer(difftime(actual_exam_date, Date, units = "days"))
  ) %>%
  # Drop pretests with no score
  filter(!is.na(Score))
# Plot the trends using ggplot
ggplot(newdf, aes(x = daysBeforeExam, y = Score, col = StudentID, group = StudentID)) +
  geom_line(size = 1) +
  geom_point(size = 2)

Resources