I would like to create a plot in R, but I have the following problem: I would like to add the date and time as a label to each point. Date and time are in two separate columns in my Excel sheet. So far I have tried geom_text(aes(label=time)). R gives me the correct time, but not the correct date: it adds the current date instead of the date that is written in my Excel sheet.
EDIT:
My data looks like this:
dput(test)
structure(list(date = c("01.08.2018", "01.08.2018", "02.08.2018", "02.08.2018", "03.08.2018", "03.08.2018"),
time = structure(c(1560943664, 1560943687, 1560943741, 1560946280, 1560946323, 1560946383),
class = c("POSIXct", "POSIXt"), tzone = ""),
north = c(6172449.577, 6172438.383, 6172438.596, 6172491.3, 6172492.683, 6172504.024),
east = c(222251.4534, 222251.0842, 222250.4152, 222250.7746, 222256.5543, 222252.3612),
number = c(1L, 1L, 2L, 2L, 3L, 3L)),
row.names = c(NA, -6L),
class = "data.frame")
This is my code:
options(stringsAsFactors = FALSE)
input2 <- "C:\\Users\\test.csv"
test<- read.csv(input2, sep=";")
test$time <- as.POSIXct(test$time, format = "%H:%M:%S")
library(ggplot2)
# dput(test)
plot <- ggplot(test, aes(x=east, y=north, size="9", group=number)) +
geom_point() + geom_line(linetype="dashed", size=1) +
geom_text(aes(label=time),hjust=0, vjust=1.5) +
theme(legend.position="none")
print(plot)
Using substr and paste you can create a new variable that has the date from the date variable and the time from the time variable, and then use this new variable for the ggplot label.
df <- structure(list(date = c("01.08.2018", "01.08.2018", "02.08.2018", "02.08.2018", "03.08.2018", "03.08.2018"), time = structure(c(1560943664, 1560943687, 1560943741, 1560946280, 1560946323, 1560946383), class = c("POSIXct", "POSIXt"), tzone = ""), north = c(6172449.577, 6172438.383, 6172438.596, 6172491.3, 6172492.683, 6172504.024), east = c(222251.4534, 222251.0842, 222250.4152, 222250.7746, 222256.5543, 222252.3612), number = c(1L, 1L, 2L, 2L, 3L, 3L)), row.names = c(NA, -6L), class = "data.frame")
# dplyr provides `%>%` and mutate(); load it here so the snippet runs on its own.
library(dplyr)
# Build the label column `dt` by combining the calendar date from `date`
# with the clock time from `time`.
df <- df %>%
  mutate(
    # The dates appear to be day-first ("01.08.2018", "02.08.2018",
    # "03.08.2018" read as 1-3 August 2018), so parse them with "%d.%m.%Y";
    # "%m.%d.%Y" would turn them into 8 January, 8 February and 8 March.
    date = as.Date(date, format = "%d.%m.%Y"),
    # Format the timestamp explicitly, then keep characters 12-19, i.e. the
    # HH:MM:SS part of "YYYY-mm-dd HH:MM:SS". The date part of `time` is not
    # the date from the sheet, so it is discarded.
    t = substr(format(time, "%Y-%m-%d %H:%M:%S"), 12, 19),
    dt = paste(date, t, sep = " ")
  )
ggplot(df, aes(x=east, y=north, size="9", group=number)) +
geom_point() +
geom_line(linetype="dashed", size=1) +
geom_text(aes(label=dt),hjust=0, vjust=1.5) +
theme(legend.position="none")
Related
I want to plot multiple time series on one plot. Currently I can plot each of them individually, but not together. How can I combine the data, given that the years are expressed as decimals?
What I basically want to end up with is this Plotting multiple time-series in ggplot (See plot in the first answer)
library(tidyverse)
library(plyr)
theme_set(theme_bw(10))
Sydney1<-read.csv("Sydney1.csv",header=TRUE)
Sydney2<-read.csv("Sydney2.csv",header=TRUE)
Eden<-read.csv("Eden.csv",header=TRUE)
StonyBay<-read.csv("Stonybay.csv",header=TRUE)
LowHead<-read.csv("Lowhead.csv",header=TRUE)
Hobart<-read.csv("Hobart.csv",header=TRUE)
Devonport<-read.csv("Devonport.csv",header=TRUE)
Freemantle<-read.csv("Freemantle.csv",header=TRUE)
ggplot(Sydney1,aes(x=Year,y=SLR))+geom_line(aes(color="Sydney1"))
ggplot(Sydney2,aes(x=Year,y=SLR))+geom_line(aes(color="Sydney2"))
ggplot(Eden,aes(x=Year,y=SLR))+geom_line(aes(color="Eden"))
ggplot(StonyBay,aes(x=Year,y=SLR))+geom_line(aes(color="StonyBay"))
ggplot(LowHead,aes(x=Year,y=SLR))+geom_line(aes(color="Lowhead"))
ggplot(Hobart,aes(x=Year,y=SLR))+geom_line(aes(color="Hobart"))
ggplot(Devonport,aes(x=Year,y=SLR))+geom_line(aes(color="Devonport"))
ggplot(Freemantle,aes(x=Year,y=SLR))+geom_line(aes(color="Freemantle"))
#Sydney 1
structure(list(Year = c(1886.0417, 1886.125, 1886.2083, 1886.2917,
1886.375, 1886.4583), SLR = c(6819L, 6942L, 6980L, 6958L, 7015L,
6892L)), row.names = c(NA, 6L), class = "data.frame")
#Sydney 2
structure(list(Year = c(1914.4583, 1914.5417, 1914.625, 1914.7083,
1914.7917, 1914.875), SLR = c(7022L, 6963L, 6915L, 6924L, 6866L,
6956L)), row.names = c(NA, 6L), class = "data.frame")
#Eden
structure(list(Year = c(1986.7917, 1986.875, 1986.9583, 1987.0417,
1987.125, 1987.2083), SLR = c(7003L, 6942L, 6969L, 7067L, NA,
7015L)), row.names = c(NA, 6L), class = "data.frame")
#Stony Bay
structure(list(Year = c(1993.0417, 1993.125, 1993.2083, 1993.2917,
1993.375, 1993.4583), SLR = c(6826L, 6868L, 6796L, 6862L, 6893L,
6951L)), row.names = c(NA, 6L), class = "data.frame")
#Low head
structure(list(Year = c(2010.125, 2010.2083, 2010.2917, 2010.375,
2010.4583, 2010.5417), SLR = c(6971L, 6968L, 7030L, 7088L, 7063L,
7035L)), row.names = c(NA, 6L), class = "data.frame")
#Hobart
structure(list(Year = c(1987.875, 1987.9583, 1988.0417, 1988.125,
1988.2083, 1988.2917), SLR = c(6916L, 6870L, 6930L, 6870L, 6820L,
6817L)), row.names = c(NA, 6L), class = "data.frame")
#Devonport
structure(list(Year = c(1989.875, 1989.9583, 1990.0417, 1990.125,
1990.2083, 1990.2917), SLR = c(6976L, 7025L, 7030L, 7046L, 6999L,
7055L)), row.names = c(NA, 6L), class = "data.frame")
#Freemantle
structure(list(Year = c(1897.0417, 1897.125, 1897.2083, 1897.2917,
1897.375, 1897.4583), SLR = c(6542L, 6524L, 6557L, 6655L, 6648L,
6729L)), row.names = c(NA, 6L), class = "data.frame")
Using the data in the Note at the end, first accumulate the series in a list L (we assume that any data frame whose column names are exactly Year and SLR is to be added), then convert that list to a single zoo object and plot it using autoplot.zoo, which uses ggplot2. Remove the facet = NULL argument if you want the series plotted in separate facets.
library(ggplot2)
library(zoo)
is_city_df <- function(x) is.data.frame(x) && identical(names(x), c("Year", "SLR"))
L <- Filter(is_city_df, mget(ls()))
z <- do.call("merge", lapply(L, read.zoo))
autoplot(z, facet = NULL)
Note
We assume the following inputs:
Sydney1 <-
structure(list(Year = c(1886.0417, 1886.125, 1886.2083, 1886.2917,
1886.375, 1886.4583), SLR = c(6819L, 6942L, 6980L, 6958L, 7015L,
6892L)), row.names = c(NA, 6L), class = "data.frame")
Sydney2 <-
structure(list(Year = c(1914.4583, 1914.5417, 1914.625, 1914.7083,
1914.7917, 1914.875), SLR = c(7022L, 6963L, 6915L, 6924L, 6866L,
6956L)), row.names = c(NA, 6L), class = "data.frame")
Eden <-
structure(list(Year = c(1986.7917, 1986.875, 1986.9583, 1987.0417,
1987.125, 1987.2083), SLR = c(7003L, 6942L, 6969L, 7067L, NA,
7015L)), row.names = c(NA, 6L), class = "data.frame")
Freemantle <-
structure(list(Year = c(1897.0417, 1897.125, 1897.2083, 1897.2917,
1897.375, 1897.4583), SLR = c(6542L, 6524L, 6557L, 6655L, 6648L,
6729L)), row.names = c(NA, 6L), class = "data.frame")
Assuming that your global environment only contains the dataframes you want to plot:
# Stack the per-city data frames found in the global environment into one
# long data frame (the object name becomes the `City` column) and draw one
# coloured line per city.
# Only data frames that have both a Year and an SLR column are kept, so other
# objects in the environment (vectors, functions, ...) do not make
# bind_rows() fail.
bind_rows(
  Filter(
    function(obj) is.data.frame(obj) && all(c("Year", "SLR") %in% names(obj)),
    mget(ls())
  ),
  .id = "City"
) %>%
  ggplot(aes(x = Year, y = SLR, color = City)) +
  geom_line()
Instead of creating multiple objects in the global env, it can be read all at once in a loop
library(dplyr)
library(purrr)
library(ggplot2)
library(readr)
files <- c("Sydney1.csv", "Sydney2.csv", "Eden.csv", "Stonybay.csv",
"Lowhead.csv", "Hobart.csv", "Devonport.csv", "Freemantle.csv")
# Read every CSV listed in `files`, tag its rows with the file name it came
# from (column `City`), row-bind the results and draw one line per file.
# The original had an extra closing parenthesis after geom_line(), which made
# the whole expression a syntax error.
map_dfr(files, ~ read_csv(.x) %>%
          mutate(City = .x)) %>%
  ggplot(aes(x = Year, y = SLR, color = City)) +
  geom_line()
Try this:
# Label each city's data frame, stack them, and plot one facet per city.
Sydney1 %>% mutate(city = 'sydney1') %>%
  bind_rows(Sydney2 %>% mutate(city = 'sydney2')) %>%
  bind_rows(Eden %>% mutate(city = 'eden')) %>%
  bind_rows(Freemantle %>% mutate(city = 'freemantle')) %>%
  ggplot(aes(x = Year, y = SLR, color = city)) +
  geom_line() +
  # The argument is `scales`; spelling it out avoids relying on partial
  # argument matching of `scale`. "free" gives every facet its own x and y
  # range.
  facet_wrap(~city, scales = 'free')
Say I have a dataset like this.
mydat=structure(list(Date = structure(c(3L, 2L, 1L), .Label = c("2019-09-23 07-AM",
"2019-09-23 08-AM", "2019-09-23 09-AM"), class = "factor"), Symbol = structure(c(1L,
1L, 1L), .Label = "BTCUSD", class = "factor"), Open = c(8065.1,
8127.07, 8082.32), High = c(8085.04, 8137.61, 8135.93), Low = c(8065.09,
8058.95, 8022.19), Close = c(8085.04, 8065.1, 8127.07), Volume = c(14.286264,
22.74164, 42.751659)), .Names = c("Date", "Symbol", "Open", "High",
"Low", "Close", "Volume"), class = "data.frame", row.names = c(NA,
-3L))
The Date variable must be converted to a timestamp, i.e. the transformed data should look like this:
Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
1325317920,4.39,4.39,4.39,4.39,0.45558087,2.0000000193,4.39
1325317980,NaN,NaN,NaN,NaN,NaN,NaN,NaN
1325318040,NaN,NaN,NaN,NaN,NaN,NaN,NaN
1325318100,NaN,NaN,NaN,NaN,NaN,NaN,NaN
1325318160,NaN,NaN,NaN,NaN,NaN,NaN,NaN
1325318220,NaN,NaN,NaN,NaN,NaN,NaN,NaN
There must also be an additional variable, Weighted_Price.
To calculate it manually I should use the simple formula (High + Low + Close) / 3.
How to get desired format in R?
If I follow your description, you want to convert Date to timestamp and calculate mean of 3 values. Using base R, that would be
transform(mydat, timestamp = as.integer(as.POSIXct(Date, format = "%Y-%m-%d %I-%p",
tz = "UTC")), weighted_avg = (High + Low + Close)/3)
Using dplyr and lubridate, we can do
library(dplyr)
library(lubridate)
mydat %>%
mutate(timestamp = as.integer(ymd_h(Date)),
weighted_avg = (High + Low + Close) / 3)
We can use anytime from anytime
library(anytime)
library(dplyr)
# Add an integer `timestamp` parsed from Date, and the row-wise mean of
# High, Low and Close as `weighted_avg`.
mydat %>%
  # `timestamp = ...` creates the column; the original `timestamp - ...` was a
  # subtraction on an object that does not exist and made mutate() fail.
  mutate(timestamp = as.integer(anytime(Date)),
         weighted_avg = rowMeans(.[c('High', "Low", "Close")]))
Or with base R
transform(mydat, timestamp = as.integer(strptime(Date, format = "%Y-%m-%d %I-%p", tz = "UTC")), weighted_avg = rowMeans(mydat[c("High", "Low", "Close")]))
I am trying to extract average values of all variables between 0 to 40 minutes every hour.
dput(head(df))
structure(list(DateTime = structure(c(1563467460, 1563468060,
1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), date = structure(c(1563467460, 1563468060,
1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), Date = structure(c(18095, 18095, 18095,
18095, 18095, 18095), class = "Date"), TimeCtr = structure(c(1563467460,
1563468060, 1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), MassConc = c(0.397627, 0.539531, 0.571902,
0.608715, 0.670382, 0.835773), VolConc = c(175.038, 160.534,
174.386, 183.004, 191.074, 174.468), NumbConc = c(234.456, 326.186,
335.653, 348.996, 376.018, 488.279), MassD = c(101.426, 102.462,
101.645, 102.145, 101.255, 101.433)), .Names = c("DateTime",
"date", "Date", "TimeCtr", "MassConc", "VolConc", "NumbConc",
"MassD"), row.names = c(NA, 6L), class = "data.frame")
What I've tried so far..
hourly_mean<-mydata %>%
filter(between(as.numeric(format(DateTime, "%M")), 0, 40)) %>%
group_by(DateTime=format(DateTime, "%Y-%m-%d %H")) %>%
summarise(variable1_mean=mean(variable1))
But it gives me a single average value for the whole period. Any help is very much welcomed.
We can convert DateTime to a date-time, use ceiling_date with an hourly unit to round DateTime up to the hour, extract the minutes from DateTime, filter to keep the rows whose minutes are at most 40 (i.e. between 0 and 40), group_by hour and take the mean of the values.
library(lubridate)
library(dplyr)
# Average every column whose name ends in "Conc" over the readings taken in
# minutes 0-40 of each hour.
df %>%
  dplyr::mutate(DateTime = ymd_hm(DateTime),
                # Label each reading with the end of the hour it falls in.
                # change_on_boundary = TRUE makes a reading at exactly HH:00
                # round up as well; by default such an instant stays on its
                # boundary and would be grouped with the previous hour's
                # readings.
                hour = ceiling_date(DateTime, "hour", change_on_boundary = TRUE),
                minutes = minute(DateTime)) %>%
  # keep only readings taken in minutes 0 to 40 (inclusive) of the hour
  filter(minutes <= 40) %>%
  group_by(hour) %>%
  summarise_at(vars(ends_with("Conc")), mean)
data
df <- structure(list(DateTime = structure(1:7, .Label = c("2019-08-0810:07",
"2019-08-0810:17", "2019-08-0810:27", "2019-08-0810:37", "2019-08-0810:47",
"2019-08-0810:57", "2019-08-0811:07"), class = "factor"), MassConc = c(0.556398,
1.06868, 0.777654, 0.87289, 0.789704, 0.51948, 0.416676), NumbConc = c(588.069,
984.018, 964.634, 997.678, 1013.52, 924.271, 916.357), VolConc = c(582.887,
979.685, 963.3, 994.178, 1009.52, 922.104, 916.856), Conc = c(281.665,
486.176, 420.058, 422.101, 429.841, 346.539, 330.282)), class =
"data.frame", row.names = c(NA, -7L))
I have a dataframe with dates. Here are the first 3 rows with dput:
# First three rows of the data: observed values (y), predictions (yhat) with
# lower/upper bounds, and the cutoff time, all with UTC timestamps.
# The class vector is c("tbl_df", "tbl", "data.frame"); the stray
# "`enter code here`" editor placeholder that had been pasted into the first
# class name has been removed.
df.cv <- structure(
  list(
    ds = structure(c(1448064000, 1448150400, 1448236800),
                   class = c("POSIXct", "POSIXt"), tzone = "UTC"),
    y = c(10.4885204292416, 10.456538985014, 10.4264986311659),
    yhat = c(10.4851491194439, 10.282089547027, 10.4354960430083),
    yhat_lower = c(10.4169914076864, 10.2162549984153, 10.368531352493),
    yhat_upper = c(10.5506038959764, 10.3556867861042, 10.5093092789713),
    cutoff = structure(c(1447977600, 1447977600, 1447977600),
                       class = c("POSIXct", "POSIXt"), tzone = "UTC")
  ),
  .Names = c("ds", "y", "yhat", "yhat_lower", "yhat_upper", "cutoff"),
  row.names = c(NA, -3L),
  class = c("tbl_df", "tbl", "data.frame")
)
I'm trying to plot the data with ggplot + geom_line so that similar day/month combinations from different years appear in one plot. So, for example, I want the y-value of 2016-01-01 to appear at the same x-value as that of 2017-01-01. I found a way to do this, but it seems to be a very complex workaround:
library(tidyverse)
library(lubridate)
p <- df.cv %>%
mutate(jaar = as.factor(year(ds))) %>%
mutate(x = as_date(as.POSIXct(
ifelse(jaar==2016, ds + years(1), ds),
origin = "1970-01-01")))
ggplot(p %>% filter(jaar!=2015), aes(x=x, group=jaar, color=jaar)) +
geom_line(aes(y=y))
It works, but as you can see I first have to extract the year, then use an ifelse to add one year to only the 2016 dates, convert with POSIXct because ifelse strips the class, convert back into POSIXct while supplying an origin, and finally remove the timestamp with as_date.
Isn't there a simpler, more elegant way to do this?
Use year<- to replace the year with any fixed leap year:
p <- df.cv %>%
mutate(jaar = as.factor(year(ds)),
x = `year<-`(as_date(ds), 2000))
ggplot(p, aes(x = x, y = y, color = jaar)) +
geom_line()
I am trying to improve the memory performance for the following example:
baseline df with 4 rows
df <- structure(list(sessionid = structure(c(1L, 2L, 3L, 4L), .Label =
c("AAA1", "AAA2","AAA3", "AAA4"), class = "factor"), bitrateinbps = c(10000000,
10000000, 10000000, 10000000), startdate = structure(c(1326758507, 1326758671,
1326759569, 1326760589), class = c("POSIXct", "POSIXt"), tzone = ""), enddate =
structure(c(1326765780, 1326758734, 1326760629, 1326761592), class = c("POSIXct",
"POSIXt"), tzone = "")), .Names = c("sessionid", "bitrateinbps", "startdate",
"enddate"), row.names = c(NA, 4L), class =
"data.frame")
alternate df with 8 rows
df <- structure(list(sessionid = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L),
.Label = c("AAA1", "AAA2", "AAA3", "AAA4", "AAA5", "AAA6", "AAA7", "AAA8"),
class = "factor"), bitrateinbps =c(10000000, 10000000, 10000000, 10000000,
10000000, 10000000, 10000000, 10000000), startdate = structure(c(1326758507,
1326758671, 1326759569, 1326760589, 1326761589, 1326762589, 1326763589, 1326764589),
class = c("POSIXct",
"POSIXt"), tzone = ""), enddate = structure(c(1326765780, 1326758734, 1326760629,
1326761592, 1326767592,
1326768592, 1326768700, 1326769592), class = c("POSIXct", "POSIXt"), tzone = "")),
.Names = c("sessionid",
"bitrateinbps", "startdate", "enddate"), row.names = c(NA, 8L), class =
"data.frame")
Run the analysis below and check the memory usage for df, then run it again for the alternate df:
library(xts)
# Build a one-column xts series for row `i` of `d`: the index runs from that
# row's startdate to its enddate in steps of 1, every value is 1, and the
# column is named after the row's sessionid.
fun0 <- function(i, d) {
  session_index <- seq(d$startdate[i], d$enddate[i], 1)
  ones <- rep(1, length(session_index))
  column_label <- list(NULL, d$sessionid[i])
  xts(ones, session_index, dimnames = column_label)
}
# loop over each row and put each row into its own xts object;
# seq_len() gives an empty index for a zero-row df, whereas 1:NROW(df)
# would give c(1, 0)
xl0 <- lapply(seq_len(NROW(df)), fun0, d = df)
# merge all the xts objects into one multi-column series
xx0 <- do.call(merge, xl0)
# for each 15-minute period, sum every column (ignoring NAs) and divide by
# 900, the number of seconds in 15 minutes
xa0 <- period.apply(xx0, endpoints(xx0, 'minutes', 15), colSums, na.rm=TRUE)/900
# transpose so that each session column becomes a row
xa1 <- t(xa0)
# convert the transposed result to a data frame
xa1 <- as.data.frame(xa1)
# bind to df
out1 <- cbind(df, xa1)
# print aggregate memory usage statistics
print(paste('R is using', memory.size(), 'MB out of limit', memory.limit(), 'MB'))
# create function to return matrix of memory consumption
# Return the size in bytes of every object in the global environment as a
# named numeric vector, largest first.
object.sizes <- function() {
  # Fetch the objects from the global environment explicitly, so that a
  # variable local to this function can never shadow a global of the same name.
  global_objects <- mget(ls(envir = .GlobalEnv), envir = .GlobalEnv)
  # vapply() always returns a numeric vector, even when the environment is
  # empty; sapply() would return an empty list there and sort() would fail.
  sizes <- vapply(
    global_objects,
    function(obj) as.numeric(object.size(obj)),
    numeric(1)
  )
  rev(sort(sizes))
}
# print to console in table format
object.sizes()
results as follows:
4 row df:
xx0 = 292104 Bytes .... do.call(merge, xl0)
xl0 = 154648 Bytes .... lapply(1:NROW(df), fun0, d=df)
8 row df:
xx0 = 799480 Bytes .... do.call(merge, xl0)
xl0 = 512808 Bytes .... lapply(1:NROW(df), fun0, d=df)
I'm looking for something a little more memory efficient for the merge and lapply functions, so I can scale out the number of rows, if anyone has any suggestions and can show the comparative results for alternatives.