I have a problem with the order of the x-axis in my plot.
Download the data from the website of the Central Bank of Perú (GDP by quarter).
library(jsonlite)
library(rstudioapi)
library(ggplot2)
library(data.table)
# Series code requested from the BCRP statistics API.
PBI <- "PN02635BQ"
# Request URL: the series code is substituted into the path, followed by the
# output format and the start/end dates.
URL3 <- sprintf(
  "https://estadisticas.bcrp.gob.pe/estadisticas/series/api/%s/json/2018-1-1/2021-7-31",
  PBI
)
Use jsonlite to download the data:
# Fetch the series and keep the `periods` element of the parsed response.
l_json <- jsonlite::fromJSON(txt = URL3)
dt_PBI <- data.table(l_json[["periods"]])
# Show the class of every column before converting anything.
sapply(X = dt_PBI, FUN = class)
# Convert `values` to numeric, rounded to 4 decimals (data.table updates in place).
dt_PBI[, values := round(as.numeric(values), digits = 4)]
names(dt_PBI) <- c("Quarter", "Millions")
As you can see, the x-axis of the plot is not in chronological order.
# Scatter plot of the value per quarter; Quarter is used as-is on the x-axis.
ggplot(data = dt_PBI, mapping = aes(x = Quarter, y = Millions)) +
  geom_point()
The x-axis is not in order because your character column becomes a factor and is sorted by alphanumeric ordering. So in your case "T1.19" would sort before "T2.18". To fix this, use library forcats and sort the factor based on the year. I split this out into its own column ("year") to make it clear what was being accomplished here.
library(jsonlite)
library(rstudioapi)
library(ggplot2)
library(data.table)
library(forcats)
# Series code and request URL for the BCRP statistics API.
PBI <- "PN02635BQ"
URL3 <- paste0("https://estadisticas.bcrp.gob.pe/estadisticas/series/api/",
               PBI, "/json/2018-1-1/2021-7-31")
l_json <- jsonlite::fromJSON(URL3)
dt_PBI <- data.table(l_json$periods)
sapply(dt_PBI, class)
dt_PBI[, values := round(as.numeric(values), 4)]
# setnames() renames by reference, which is the data.table way to rename.
setnames(dt_PBI, c("Quarter", "Millions"))
# Quarter labels look like "T1.18": the quarter number is character 2 and the
# two-digit year is characters 4-5. Store the year as an integer so it sorts
# numerically rather than as text.
dt_PBI[, year := as.integer(substr(Quarter, 4, 5))]
# Order the factor levels by year first and quarter second, so the order within
# a year is explicit instead of depending on how ties are broken.
dt_PBI[, Quarter := fct_reorder(
  Quarter,
  year * 10L + as.integer(substr(Quarter, 2, 2)),
  min
)]
# Scatter plot of the value per quarter; the x-axis follows the level order of Quarter.
ggplot(data = dt_PBI, mapping = aes(x = Quarter, y = Millions)) +
  geom_point()
Related
I am Asheesh from India, and I am new to programming for data science. While completing a project I encountered the message "Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous" when running the following code.
# Bar layer with member_casual on x and ride_length on y.
ggplot(data = cyclistic_data) +
  geom_bar(mapping = aes(x = member_casual, y = ride_length))
I am also sharing all of my code chunks in case you need them for reference.
install.packages("tidyverse")
install.packages("lubridate")
install.packages("dplyr")
install.packages("ggplot2")
library(tidyverse)
library(lubridate)
library(dplyr)
library(ggplot2)
library(readr)
# Folder that holds the monthly trip exports; adjust to your machine.
data_dir <- "C:\\Users\\sashe\\Desktop\\Data Analytics\\8_Capstone\\New folder"

# File-name prefixes for the twelve months 2021-06 .. 2022-05.
month_tags <- c(sprintf("2021%02d", 6:12), sprintf("2022%02d", 1:5))
trip_files <- file.path(data_dir, paste0(month_tags, "-divvy-tripdata.csv"))

# Read every monthly file and stack the rows. Full paths are used instead of
# setwd(), so the session's working directory is left untouched.
combined_data <- do.call(rbind, lapply(trip_files, read.csv))

# Ride length as a difftime. The units are fixed to seconds because the default
# ("auto") picks a unit based on the data.
combined_data$ride_length <- difftime(
  combined_data$ended_at,
  combined_data$started_at,
  units = "secs"
)

# Create columns for: date, day of week, month, day, year (all from the start time).
combined_data$date <- as.Date(combined_data$started_at) # default format is yyyy-mm-dd
combined_data$day_of_week <- format(combined_data$date, "%A")
combined_data$month <- format(combined_data$date, "%m")
combined_data$day <- format(combined_data$date, "%d")
combined_data$year <- format(combined_data$date, "%Y")

# Clean the data.
combined_data <- na.omit(combined_data)    # remove rows with NA values
combined_data <- distinct(combined_data)   # remove duplicate rows
# Keep only rides with a positive length.
combined_data <- combined_data[combined_data$ride_length > 0, ]
# Remove columns not needed for the analysis.
combined_data <- combined_data %>%
  select(-c(ride_id, start_station_id, end_station_id,
            start_lat, start_lng, end_lat, end_lng))
cyclistic_data <- combined_data
# Bar layer with member_casual on x and ride_length on y.
ggplot(data = cyclistic_data) +
  geom_bar(mapping = aes(x = member_casual, y = ride_length))
I would be grateful if you could help me with this.
I am trying to create a plot in R using Geom_Segment. I am stuck with an error that says I need to input yend but I am inputting it already... this is my code:
library(ggplot2)
library(data.table)
library(magrittr)
# Question snippet: timeline of audits drawn with geom_segment.
# Sets the From column to today's date for every row.
dataset$From<-Sys.Date()
# NOTE(review): the formatted result is not assigned, so dataset$From is unchanged by this line.
format(dataset$From, format="%Y-%m-%dT%H:%M:%OS")
# Sets the To column to today's date for every row.
dataset$To<-Sys.Date()
# NOTE(review): as above, the formatted result is discarded.
format(dataset$To, format="%Y-%m-%dT%H:%M:%OS")
# NOTE(review): x is mapped to `datetime_start`, which is not created anywhere in
# this snippet -- confirm it exists in `dataset`.
ggplot(dataset, aes(x=datetime_start, y=dataset$Audit_Title,
color=dataset$Employee_Name)) +
# One segment per row from From to To, at the height of its Audit_Title.
geom_segment(aes(x=dataset$From,xend=dataset$To,y=dataset$Audit_Title,yend=dataset$Audit_Title),size=20)+
# Override the size used for the legend keys.
scale_colour_discrete(guide=guide_legend(override.aes=list(size=15))) +
ggtitle("Audit by Employee Timeline") + xlab("") + ylab("") + theme_bw()
SAMPLE DATA:
Here is the sample data
This is how I changed the code below to take in the data from Excel I inputted into Power BI:
library(ggplot2)
library(dplyr)
# transform into date
# Parses the selected columns as dates written month/day/year.
# NOTE(review): vars() is given `dataset$From` / `dataset$To` here, whereas the
# answer's version of this step passes the bare column names `From, To`.
dataset <- dataset %>%
mutate_at(vars(dataset$From, dataset$To),
.funs = function(tt) readr::parse_date(as.character(tt),
format = "%m/%d/%Y"))
# One horizontal segment per row, from From to To, at the employee's position on y.
ggplot(dataset)+
geom_segment(aes(x=dataset$From, xend=dataset$To,
y=dataset$Employee_Name, yend=dataset$Employee_Name))
First of all, ideally you would share your data as a dput(dataset). If you can't share real data, you should make a minimal reproducible example and share that. See here
Here's your data
library(ggplot2)
library(dplyr)
# Rebuild the two sample rows; the columns are named as they are read.
df <- read.table(
  text = "01/03/2020 03/16/2020 Supply_Chain John_Smith
05/08/2020 08/20/2020 Business_Unit Karen_Scott",
  col.names = c("From", "To", "Audit_Title", "Employee_Name")
)
# transform into date
# From and To are written month/day/year. as.character() keeps the conversion
# correct if the columns were read as factors.
date_cols <- c("From", "To")
df[date_cols] <- lapply(
  df[date_cols],
  function(tt) as.Date(as.character(tt), format = "%m/%d/%Y")
)
Now do the actual plot by selecting the proper x xend and having y be the employee (y=yend).
# One horizontal segment per row: it runs from From to To at the employee's
# position on the y-axis (y and yend are the same column).
ggplot(data = df) +
  geom_segment(
    mapping = aes(
      x = From,
      xend = To,
      y = Employee_Name,
      yend = Employee_Name
    )
  )
Which produces
If you want fancy colors, labels and stuff go ahead and check the proper documentation for ggplot. See here
I have this simple example of a boxplot:
# Day numbers and the age class of each observation (5 "juv" followed by 7 "ad").
date.numeric <- c(98, 105, 110, 120, 75, 35, 200, 167, 365, 425, 400, 398)
age.class <- rep(c("juv", "ad"), times = c(5, 7))
mytable <- data.frame(date.numeric = date.numeric, age.class = age.class)
# Box plot of the numeric day values per age class.
ggplot(data = mytable, mapping = aes(x = age.class, y = date.numeric)) +
  geom_boxplot()
My variable date.numeric is depicted as numbers in the plot, in which date number 1 represents date 1/1/2015 (reference date). How can I change the y-axis to show dates in format "month-year" instead of the numeric format?
try as.Date()
library(ggplot2)
date.numeric <- c(98,105,110,120,75,35,200,167,365,425,400,398)
age.class <- c("juv","juv","juv","juv","juv","ad","ad","ad","ad","ad","ad","ad")
mytable <- data.frame(date.numeric,age.class)
# Day number 1 is the reference date 2015-01-01, so the offset from the
# reference date is (day number - 1). Using the day number itself as the offset
# would shift every date one day late.
mytable$date <- as.Date("2015-01-01") + (date.numeric - 1)
# Box plot of the dates per age class. date_labels formats the y-axis as
# abbreviated month and year (e.g. "Apr-2015"), the "month-year" form asked for.
ggplot(mytable, aes(x = age.class, y = date)) +
  geom_boxplot() +
  scale_y_date(date_labels = "%b-%Y")
Created on 2018-07-17 by the reprex package (v0.2.0.9000).
try creating a date offset variable and add that to your y-axis.
# Reference date: day number 1 corresponds to this date.
date.start <- as.Date("2015-01-01")
date.numeric <- c(98,105,110,120,75,35,200,167,365,425,400,398)
age.class <- c("juv","juv","juv","juv","juv","ad","ad","ad","ad","ad","ad","ad")
mytable <- data.frame(date.numeric,age.class)
# Because day 1 is the reference date itself, the offset is (day number - 1);
# adding the raw day number would place every point one day late.
ggplot(mytable, aes(x = age.class, y = date.start + (date.numeric - 1))) +
  geom_boxplot()
The axis would then look like Apr 2015, etc.
I have a data frame containing multiple numeric columns and one column with different factors. I'd like to produce a unique image containing the plots of the numeric columns, by factor. I tried the following:
# Question snippet: three numeric columns of 40 uniform random numbers each,
# plus a grouping column with four labels repeated 10 times.
varA <- runif(40)
varB <- runif(40)
# NOTE(review): varB is assigned a second time here, replacing the values drawn above.
varB <- runif(40)
varC <- runif(40)
mainVar <- c(rep('cat', 10), rep('dof', 10), rep('mouse', 10), rep('frog', 10))
plotData <- data.frame(varA, varB, varC, mainVar)
# Send all plots to a single PDF file.
pdf('asd.pdf')
# Request a 2x2 layout ...
par(mfrow=c(2,2))
# ... then, for each level of mainVar, draw one box plot per numeric column.
by(plotData, plotData$mainVar, function(x){
# This call sets mfrow again (1 row, 3 columns) for every group.
par(mfrow=c(1,3))
boxplot(x$varA)
boxplot(x$varB)
boxplot(x$varC)
})
# Close the PDF device.
dev.off()
It produces a unique pdf, but with a page for every factor.
Instead, I'd like to get something like that (without the red lines):
First, both techniques shown here prefer data in a "tall" format. There are several tools that will reshape it for you, I'll use
# library(tidyr)
# Reshape to long form: every column except mainVar is stacked, with the column
# name in `k` and its value in `v`.
plotDataTall <- tidyr::gather(plotData, key = k, value = v, -mainVar)
# Preview the first rows.
head(plotDataTall)
# mainVar k v
# 1 cat varA 0.4023846
# 2 cat varA 0.3406813
# 3 cat varA 0.7990530
# 4 cat varA 0.3706167
# 5 cat varA 0.5986029
# 6 cat varA 0.1626782
Other tools include the reshape2 package or the stats function reshape, both of which are increasingly less-intuitive to use for first time users.
ggplot2
library(ggplot2)
# One facet per mainVar level (two rows of panels), one box per variable,
# with both axis titles removed.
ggplot(data = plotDataTall, mapping = aes(x = k, y = v)) +
  geom_boxplot() +
  facet_wrap(~mainVar, nrow = 2) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )
Base R
Because you cannot nest uses of par(mfrow=...) (they replace, not nest), you can stick with the over-arching 2x2 and handling the per-variable boxplots within boxplot. This can be with the wide data:
# One panel per level of mainVar on a single 2x2 page. The previous graphics
# settings are saved and restored once all panels are drawn.
old_par <- par(mfrow = c(2, 2), mar = c(4, 3, 3, 0.1))
ign <- by(plotData, plotData$mainVar, function(x) {
  # Passing the three columns as a data frame labels each box with its column
  # name (separate vectors would be labelled 1, 2, 3). as.character() keeps the
  # panel title a plain string even if mainVar is stored as a factor.
  boxplot(x[, c("varA", "varB", "varC")], main = as.character(x$mainVar[1]))
})
par(old_par)
or the tall format:
# Same 2x2 layout, drawn from the long-format data: within each mainVar group,
# v is split by k through the formula interface.
par(mar = c(4, 3, 3, 0.1), mfrow = c(2, 2))
draw_group <- function(x) {
  boxplot(v ~ k, data = x, main = x$mainVar[1])
}
ign <- by(plotDataTall, plotDataTall$mainVar, draw_group)
(I took the liberty of adjusting the margins, primarily for a shrunken combined plot here. Not required for production.)
As r2evans already points out, I doubt this is possible with the base plot function. Using ggplot2 (part of the tidyverse) you can get a one-page plot using:
library(tidyverse)
# Stack every column except mainVar into name/value pairs, then draw one box
# per variable in one facet per mainVar level.
plotData %>%
  gather(key = var, value = y, -mainVar) %>%
  ggplot(mapping = aes(x = var, y = y)) +
  geom_boxplot() +
  facet_wrap(~mainVar)
note that this also uses the pipe operator (dplyr), and gather (tidyr) both part of tidyverse
I have the following data on student scores from several pretests taken before their actual exam.
# Column names: six pretest dates followed by the exam-date column.
a <- c(
  "2013-02-25", "2013-03-13", "2013-04-24",
  "2013-05-12", "2013-07-12", "2013-08-11",
  "actual_exam_date"
)
# One vector per student. Each vector mixes scores with a date string, so every
# value is held as text.
b <- c("300", "230", "400", NA, NA, NA, "2013-04-30")
c <- c(NA, "260", "410", "420", NA, NA, "2013-05-30")
d <- c("300", "230", "400", NA, "370", "390", "2013-08-30")
# Stack the students as rows, then label rows and columns in one step.
df <- as.data.frame(rbind(b, c, d))
dimnames(df) <- list(paste("student", 1:3), a)
The actual datasheet is much larger. Since the dates vary so much, and the timing between the pretests and to the exam are relatively similar, I would rather convert the true dates into the number of days before the exam, so that they are the new column names, not dates. I understand that this will merge some of the columns which is OK. How would I be able to do that?
This is another good use case for reshape2, because you want to go to long form for plotting. For example:
# you are going to need the student id as a field
df$student_id <- row.names(df)
library('reshape2')
# Long form: one row per student and pretest date.
df2 <- melt(df, id.vars = c('student_id', 'actual_exam_date'),
            variable.name = 'pretest_date',
            value.name = 'pretest_score')
# drop empty observations
df2 <- df2[!is.na(df2$pretest_score), ]
# these need to be dates; converting through as.character() is safe whether the
# columns are stored as text or as factors
df2$actual_exam_date <- as.Date(as.character(df2$actual_exam_date))
df2$pretest_date <- as.Date(as.character(df2$pretest_date))
# date difference, in whole days
df2$days_before_exam <- as.integer(df2$actual_exam_date - df2$pretest_date)
# scores need to be numeric; as.numeric() applied directly to a factor would
# return the level codes instead of the scores, hence as.character() first
df2$pretest_score <- as.numeric(as.character(df2$pretest_score))
# now you can make some plots
library('ggplot2')
# One line per student. The x-axis is reversed, and a dashed vertical line is
# drawn at x = 0.
ggplot(
  data = df2,
  mapping = aes(x = days_before_exam, y = pretest_score, col = student_id)
) +
  geom_line(lwd = 1) +
  scale_x_reverse() +
  geom_vline(xintercept = 0, linetype = "dashed", lwd = 1) +
  labs(
    title = "Pretest Performance",
    x = "Days Before Exam",
    y = "Pretest Score"
  )
Here is one way to approach this one. I am sure there are many others. I commented the code to explain what is going on at each step:
# Load two libraries you need
library(tidyr)
library(dplyr)
# Rebuild the data frame from the question: six pretest-date columns plus the
# exam-date column, one row per student vector (b, c, d).
a <- c(
  "2013-02-25", "2013-03-13", "2013-04-24",
  "2013-05-12", "2013-07-12", "2013-08-11",
  "actual_exam_date"
)
# Scores and the date string share a vector, so all values are text.
b <- c("300", "230", "400", NA, NA, NA, "2013-04-30")
c <- c(NA, "260", "410", "420", NA, NA, "2013-05-30")
d <- c("300", "230", "400", NA, "370", "390", "2013-08-30")
df <- as.data.frame(rbind(b, c, d))
names(df) <- a
# Add student IDs as a column instead of row names and move them to first position
df$StudentID <- row.names(df)
row.names(df) <- NULL
df <- select(df, StudentID, everything())
# Gather date columns as 'categories' with score as the new column value
newdf <- df %>%
  gather(Date, Score, -actual_exam_date, -StudentID) %>%
  arrange(StudentID)
# Convert the text (or factor) columns to their real types, compute the number
# of days before the exam, and drop dates with no score. Score must be numeric,
# otherwise it is plotted as a discrete axis. The date difference is computed
# row by row, so no grouping is needed and the result is left ungrouped.
newdf <- newdf %>%
  mutate(
    actual_exam_date = as.Date(as.character(actual_exam_date)),
    Date = as.Date(as.character(Date)),
    Score = as.numeric(as.character(Score)),
    daysBeforeExam = as.integer(difftime(actual_exam_date, Date, units = "days"))
  ) %>%
  filter(!is.na(Score))
# Plot the trends using ggplot (ggplot2 has to be loaded for this step)
library(ggplot2)
ggplot(newdf, aes(x = daysBeforeExam, y = Score, col = StudentID, group = StudentID)) +
  geom_line(size = 1) +
  geom_point(size = 2)