I am trying to follow this tutorial but I am encountering an error. I have my data formatted exactly like it is in the example, at least as far as I can tell. But when I try to run the calculation for the MCP below:
#Space use by week, month, or year - MCP
> mcps.year<-trk %>% nest(-id,-year) %>%
+ mutate(mcparea = map(data, ~hr_mcp(., levels = c(0.95)) %>% hr_area)) %>%
+ select(id, year, mcparea) %>% unnest()
I receive this error:
Error: Problem with `mutate()` column `mcparea`.
i `mcparea = map(data, ~hr_mcp(., levels = c(0.95)) %>% hr_area)`.
x is.numeric(crs) || is.character(crs) || inherits(crs, "crs") is not TRUE
Run `rlang::last_error()` to see where the error occurred.
In addition: Warning message:
All elements of `...` must be named.
Did you want `data = c(x_, y_, t_, tod_, week, month, hour, sl, nsd_)`?
How can I fix this? I tried setting x_ and y_ as coordinates and then setting the projection to make it a spatial points dataframe:
trk <- st_as_sf(trk, coords = c("x_", "y_"))
st_crs(trk) <- 32614
But then I got this error:
Error: Problem with `mutate()` column `mcparea`.
i `mcparea = map(data, ~hr_mcp(., levels = c(0.95)) %>% hr_area)`.
x Can't subset columns that don't exist.
x Columns `x_` and `y_` don't exist.
Run `rlang::last_error()` to see where the error occurred.
In addition: Warning message:
All elements of `...` must be named.
Did you want `data = c(t_, tod_, week, month, hour, sl, nsd_, geometry)`?
EDIT:
OK, here is the code I'm running:
#Set seed for random number generator
set.seed(10299)
#Import data
setwd("C:/Users/saraa/OneDrive - UNT System/Masters Thesis/Data/Testing RSF")
turtle <- read_csv("West2021KDE.csv")
turtle.dat <- as(turtle, "data.frame")
#Data cleaning
#Remove incomplete observations
ind<-complete.cases(turtle.dat[,c("y", "x", "date", "CST")])
turtle.dat<-turtle.dat[ind==TRUE,]
#Check for and remove duplicates
ind2<-turtle.dat %>% select(y, x, CST, date, id) %>% duplicated
sum(ind2)
turtle.dat<-turtle.dat[ind2!=TRUE,]
#Make timestamp a date/time variable
turtle.dat$timestamp <- with(turtle.dat, mdy(date) + hms(CST))
turtle.dat$timestamp<-as.POSIXct(turtle.dat$timestamp, format="%Y-%m-%d %H:%M")
#Plot data
#Separately
ggplot(turtle.dat, aes(x=x, y=y))+geom_point()+
facet_wrap(~id, scales="free")
#Combined
ggplot(turtle.dat, aes(x=x, y=y, color=as.factor(id)))+
geom_point()
#Creating a track using amt
trk <- mk_track(turtle.dat, .x=x, .y=y, .t=timestamp, id=id,
crs = CRS("+init=epsg:4326"))
#Now it is easy to calculate day/night with either movement track
trk <- trk %>% time_of_day()
#Transform coordinates
trk <- transform_coords(trk,sp::CRS("+init=epsg:32614"))
trk.class<-class(trk)
#Nest tracks by individual
nesttrk<-trk%>%nest(-id)
nesttrk
#Make new columns of time of observation
trk <- trk %>%
mutate(week = week(t_),
month = month(t_, label=TRUE),
year=year(t_),
hour = hour(t_))
#Calculate movement distances
trk <- trk %>% nest(-id) %>%
mutate(sl = map(data, step_lengths),
nsd_=map(data, nsd)) %>% unnest(cols = c(data, sl, nsd_))
#Transform to track object
# The nest()/unnest() round trip above returns a plain tibble, so the class
# vector saved earlier in `trk.class` is put back here.
# NOTE(review): assigning class() restores only the class vector; any other
# attributes the original track carried (presumably including its CRS) are
# not restored by this line. That looks like the cause of the
# `is.numeric(crs) || is.character(crs) || inherits(crs, "crs")` failure in
# hr_mcp() -- confirm with attributes(trk) before and after the nest step.
class(trk)
class(trk)<-trk.class
trk
#Net squared displacement over time
ggplot(trk, aes(x = t_, y=nsd_)) + geom_point()+
facet_wrap(~id, scales="free")
#Step length distribution
ggplot(trk, aes(x = month, y = log(sl))) +
geom_boxplot()+geom_smooth()+facet_wrap(~id)
#Space use by week, month, or year - MCP
mcps.year<-trk %>% nest(-id,-year) %>%
mutate(mcparea = map(data, ~hr_mcp(., levels = c(0.95)) %>% hr_area)) %>%
select(id, year, mcparea) %>% unnest()
ggplot(mcps.year, aes(x = year, y = area, colour=as.factor(year))) + geom_point()+
geom_smooth()+ facet_wrap(~id, scales="free")
and str(trk)
> str(trk)
track_xyt [1,058 x 13] (S3: track_xyt/track_xy/tbl_df/tbl/data.frame)
$ id : chr [1:1058] "M13" "M13" "M13" "M13" ...
$ x_ : Named num [1:1058] 687578 687491 687455 687566 687518 ...
..- attr(*, "names")= chr [1:1058] "1" "3" "122" "128" ...
$ y_ : Named num [1:1058] 3659888 3659889 3659893 3659891 3659959 ...
..- attr(*, "names")= chr [1:1058] "1" "3" "122" "128" ...
$ t_ : POSIXct[1:1058], format: "2021-04-03 15:19:00" "2021-04-18 11:19:00" "2021-05-02 12:43:00" "2021-06-03 11:23:00" ...
$ tod_ : Factor w/ 2 levels "day","night": 1 2 1 1 2 2 2 2 2 2 ...
$ dir_abs: num [1:1058] 1.558 1.485 4.699 0.616 5.525 ...
$ dir_rel: num [1:1058] NA -0.073 -3.07 2.2 -1.374 ...
$ sl : num [1:1058] 87.2 36.1 111.6 83.8 40.5 ...
$ nsd_ : Named num [1:1058] 0 7600 15171 140 8642 ...
..- attr(*, "names")= chr [1:1058] "1" "3" "122" "128" ...
$ week : num [1:1058] 14 16 18 22 23 25 26 27 30 30 ...
$ month : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 5 6 6 6 7 7 7 7 ...
$ year : num [1:1058] 2021 2021 2021 2021 2021 ...
$ hour : int [1:1058] 15 11 12 11 10 9 9 9 10 0 ...
Related
I'm scaling one column in a dataset with the intention of fitting a linear model. However, when I try to write the dataframe (with scaled column) to a csv, it doesn't work because the scaled column became complex with center and scale attributes.
Can someone please indicate how to convert the scaled column to something that can write to a csv? (and maybe why scale() needs to do it this way.)
# make a data frame
testDF <- data.frame(x1 = c(1,2,2,3,2,4,4,5,6,15,36,42,11,12,23,24,25,66,77,18,9),
x2 = c(1,4,5,9,4,15,17,25,35,200,1297,1764,120,150,500,500,640,4200,6000,365,78))
# scale the x1 attribute
testDF <- testDF %>%
mutate(x1_scaled = scale(x1, center = TRUE, scale = TRUE))
# write to csv doesn't work
write_csv(as.matrix(testDF), "testDF.csv")
# but plotting and lm do work
ggplot(testDF, aes(x1_scaled)) +
geom_histogram(aes(y = ..density..),binwidth = 1)
Lm_scaled <- lm(x2 ~ x1_scaled, data = testDF)
plot(Lm_scaled)
scale returns a matrix output. We could extract the column or use as.vector to remove the dim attribute
testDF <- testDF %>%
mutate(x1_scaled = as.vector(scale(x1, center = TRUE, scale = TRUE)))
Check the structure of the output without as.vector and with as.vector
> testDF %>%
+ mutate(x1_scaled = scale(x1, center = TRUE, scale = TRUE)) %>% str
'data.frame': 21 obs. of 3 variables:
$ x1 : num 1 2 2 3 2 4 4 5 6 15 ...
$ x2 : num 1 4 5 9 4 15 17 25 35 200 ...
$ x1_scaled: num [1:21, 1] -0.824 -0.776 -0.776 -0.729 -0.776 ...
..- attr(*, "scaled:center")= num 18.4
..- attr(*, "scaled:scale")= num 21.2
> testDF %>%
+ mutate(x1_scaled = as.vector(scale(x1, center = TRUE, scale = TRUE))) %>% str
'data.frame': 21 obs. of 3 variables:
$ x1 : num 1 2 2 3 2 4 4 5 6 15 ...
$ x2 : num 1 4 5 9 4 15 17 25 35 200 ...
$ x1_scaled: num -0.824 -0.776 -0.776 -0.729 -0.776 ...
You can simply convert the scale column to numeric in base R and write out the dataframe:
testDF$x1_scaled <- as.numeric(testDF$x1_scaled)
write_csv(testDF, "testDF.csv")
I have a data set that has missing data from about July 7th to July 19th. Graph of my dataset. You can see the data gap pretty easily. I would like to truncate it so that the gap isn't there and the before and after data butt up against each other. Something like this. I did try to follow the linked example, but I don't understand how they set up xseq. I also tried just removing the offending dates and creating a dataframe without them, but that didn't solve the problem.
I'm not sure if it's helpful, but here is the existing code for the graph:
together <- ggplot() +
stat_summary(data = grid_pad, aes(x = DTT, y = grid_value, fill = 'Ambient'), geom='ribbon', fun.data = mean_cl_quantile, alpha = 0.25) +
stat_summary(data = grid_pad, aes(x = DTT, y = grid_value, color = 'Ambient'), geom='line', fun = mean, size = 0.9) +
stat_summary(data = turtle_pad, aes(x = DTT, y = turtle_value, fill = 'Turtle'), geom='ribbon', fun.data = mean_cl_quantile, alpha = 0.25) +
stat_summary(data = turtle_pad, aes(x = DTT, y = turtle_value, color = 'Turtle'), geom='line', fun = mean, size = 0.9) +
labs(x = "Date", y = "Temperature")+
scale_color_manual("Legend", values = c('Ambient' = '#1b9e77', 'Turtle' = '#d95f02'), labels = c(Ambient = 'Ambient Temp', Turtle = 'Turtle Temp')) +
scale_fill_manual("Legend", values = c('Ambient' = '#1b9e77', 'Turtle' = '#d95f02'), labels = c(Ambient = 'Ambient Temp', Turtle = 'Turtle Temp')) +
theme_classic() +
ggtitle("Ambient and Turtle Temperatures")+
ggeasy::easy_center_title()+
easy_remove_legend_title()
together
and here is the structure of my data:
> str(grid_pad)
grouped_df [142,800 x 3] (S3: grouped_df/tbl_df/tbl/data.frame)
$ Logger : Factor w/ 50 levels "TL1","TL11","TL12",..: 1 1 1 1 1 1 1 1 1 1 ...
$ DTT : POSIXct[1:142800], format: "2021-05-28 00:00:00" "2021-05-28 01:00:00" "2021-05-28 02:00:00" "2021-05-28 03:00:00" ...
$ grid_value: num [1:142800] NA NA NA NA NA 19.5 19.5 19.5 20 22 ...
- attr(*, "groups")= tibble [50 x 2] (S3: tbl_df/tbl/data.frame)
..$ Logger: Factor w/ 50 levels "TL1","TL11","TL12",..: 1 2 3 4 5 6 7 8 9 10 ...
> str(turtle_pad)
grouped_df [57,120 x 3] (S3: grouped_df/tbl_df/tbl/data.frame)
$ Name : Factor w/ 20 levels "F1","F11","F12",..: 1 1 1 1 1 1 1 1 1 1 ...
$ DTT : POSIXct[1:57120], format: "2021-05-28 00:00:00" "2021-05-28 01:00:00" "2021-05-28 02:00:00" "2021-05-28 03:00:00" ...
$ turtle_value: num [1:57120] NA NA NA NA NA NA NA NA NA NA ...
- attr(*, "groups")= tibble [20 x 2] (S3: tbl_df/tbl/data.frame)
..$ Name : Factor w/ 20 levels "F1","F11","F12",..: 1 2 3 4 5 6 7 8 9 10 ...
with base R, verbose:
## Toy data: 41 daily observations with a 5-day run of NAs in the middle
df_with_gap <- data.frame(
  Name = gl(41, 1),
  DTT = Sys.Date() + (-20:20),
  turtle_value = c(runif(20), rep(NA, 5), runif(16))
)
## keep only the rows that actually hold a measurement
rows_to_keep <- !is.na(df_with_gap$turtle_value)
## remove NAs
df_without_gap <- df_with_gap[rows_to_keep, ]
## create a consecutive integer index to use for x-values in ggplot, so the
## observations before and after the gap sit directly next to each other.
## (The earlier `rownames(df)` referred to the stats function `df`, not to
## this data frame, so it returned NULL and no column was created. A numeric
## index also keeps the x-axis continuous instead of discrete.)
df_without_gap$pseudo_date <- seq_len(nrow(df_without_gap))
Please note:
While you could use DTT of the remaining values to label your axis (see the `labels` argument in `?scale_x_continuous`), the plot will be misleading, as it covers up missing information.
a scatter plot would be the way to go if you want to show the association between ambient and turtle temperature.
To show seasonality instead, consider adding a smoother (`?geom_smooth` for ggplot).
to convey variability, a boxplot might be more instructive
helpful chart pickers on the web
I'm trying to run different forecast modeling methods on a monthly tsibble dataset. Its head() looks like:
# A tsibble: 6 x 2 [1M]
month total
<mth> <dbl>
1 2000 Jan 104.
2 2000 Feb 618.
3 2000 Mar 1005.
4 2000 Apr 523.
5 2000 May 1908.
6 2000 Jun 1062.
and has a structure of:
tsibble [212 x 2] (S3: tbl_ts/tbl_df/tbl/data.frame)
$ month: mth [1:212] 2000 Jan, 2000 Feb, 2000 Mar, 2000 Apr, 2000 May, 2000 Jun, 2000 Jul, 2000 Aug, 2000 Sep, 2000 Oct, 2000 Nov...
$ total: num [1:212] 104 618 1005 523 1908 ...
- attr(*, "key")= tibble [1 x 1] (S3: tbl_df/tbl/data.frame)
..$ .rows: list<int> [1:1]
.. ..$ : int [1:212] 1 2 3 4 5 6 7 8 9 10 ...
.. ..# ptype: int(0)
- attr(*, "index")= chr "month"
..- attr(*, "ordered")= logi TRUE
- attr(*, "index2")= chr "month"
- attr(*, "interval")= interval [1:1] 1M
..# .regular: logi TRUE
The dataset is monthly from 2000/01 to 2017/08 with no missing values or time periods. I'm trying to run a model such as:
df %>%
model(STL(total ~ season(window=9),robust=T)) %>%
components() %>% autoplot()
fit <- df %>%
model(ANN =ETS(total ~ error("A") + trend("A") + season()))
But for any type of model I try to run I get the exact same error each time. I'm looking for suggestions to correct the structure of the tsibble to allow these model functions to work.
Error in UseMethod("model") :
no applicable method for 'model' applied to an object of class "c('tbl_ts', 'tbl_df', 'tbl', 'data.frame')"
EDIT: Including reproducible example:
a = c(sample(1:1000,212))
df.ts <- ts(a, start=c(2000,1),end=c(2017,8),frequency=12)
df <- df.ts %>% as_tsibble()
Thanks for the example, I was able to get it to run without any errors, as follows:
library(tidyverse)
library(fpp3)
# Simulate 212 monthly observations covering Jan 2000 - Aug 2017
a <- sample(1:1000, 212)
df.ts <- ts(a, start = c(2000, 1), end = c(2017, 8), frequency = 12)
# Converting a univariate ts gives a tsibble with columns `index` and `value`
df <- df.ts %>% as_tsibble()
# STL decomposition. The formula refers to the tsibble column `value`; using
# the free-standing vector `a` only worked because it happened to exist in
# the calling environment with the same contents.
df %>%
  model(STL(value ~ season(window = 9), robust = TRUE)) %>%
  components() %>%
  autoplot()
# ETS model with additive error and additive trend
fit <- df %>%
  model(ANN = ETS(value ~ error("A") + trend("A") + season()))
report(fit)
Here is what the decomposition looks like:
Here is the report of the model:
As both Russ Conte and Rob Hyndman found there's nothing inherently wrong with the example code being used.
I believe there was an overlapping issue between two packages, as my issue was resolved upon removing and reinstalling the forecasting packages.
I am using a dataset from tidytuesday and trying to order values within each factor.
For example, in the plot below I want the values of Years to be in ascending order for each city (Years to start from 2012 to 2021).
How can I order them? Is there any fct_*() function to order within each factor level?
df summary:
str(transit_cost)
output:
tibble [537 x 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ e : num [1:537] 7136 7137 7138 7139 7144 ...
$ country : Factor w/ 56 levels "Argentina","Australia",..: 9 9 9 9 9 30 9 54 54 54 ...
$ city : Factor w/ 140 levels "Ad Dammam","Ahmadabad",..: 131 128 128 128 128 3 80 107 69 69 ...
$ line : Factor w/ 366 levels "1995-98 program",..: 15 354 321 286 361 282 14 342 306 305 ...
$ start_year : num [1:537] 2020 2009 2020 2020 2020 ...
$ end_year : num [1:537] 2025 2017 2030 2030 2030 ...
$ rr : chr [1:537] "Not Railroad" "Not Railroad" "Not Railroad" "Not Railroad" ...
$ length : num [1:537] 5.7 8.6 7.8 15.5 7.4 9.7 5.8 5.1 4.2 4.2 ...
$ tunnel_per : num [1:537] 0.877 1 1 0.568 1 ...
$ tunnel : num [1:537] 5 8.6 7.8 8.8 7.4 7.1 5.8 5.1 4.2 4.2 ...
$ stations : num [1:537] 6 6 3 15 6 8 5 2 2 2 ...
$ source1 : chr [1:537] "Plan" "Media" "Wiki" "Plan" ...
$ cost : num [1:537] 2830 3200 5500 8573 5600 ...
$ currency : chr [1:537] "CAD" "CAD" "CAD" "CAD" ...
$ year : num [1:537] 2018 2013 2018 2019 2020 ...
$ ppp_rate : num [1:537] 0.84 0.81 0.84 0.84 0.84 1.3 0.84 1 1 1 ...
$ real_cost : num [1:537] 2377 2592 4620 7201 4704 ...
$ cost_km_millions: num [1:537] 417 301 592 465 636 ...
$ source2 : chr [1:537] "Media" "Media" "Media" "Plan" ...
$ reference : chr [1:537] "https://www.translink.ca/Plans-and-Projects/Rapid-Transit-Projects/Broadway-Subway-Project.aspx" "https://www.thestar.com/news/gta/transportation/2017/12/15/trudeau-wynne-tory-on-hand-to-cut-ribbon-on-32-billi"| __truncated__
$ country_code : chr [1:537] "CA" "CA" "CA" "CA" ...
Code:
library(tidyverse)
library(tidytuesdayR)
library(scales)
library(glue)
library(countrycode)
tt <- tidytuesdayR::tt_load("2021-01-05")
# Coerce column types: identifier columns become factors and the year/cost
# columns become numeric. across() replaces the superseded
# mutate_at(vars(...)) interface; the result is the same.
transit_cost <- tt$transit_cost %>%
  mutate(
    across(c(country, city, line), as.factor),
    across(c(start_year, end_year, real_cost), as.numeric)
  )
# Drop rows without an id, translate the two-letter country codes into
# country names, and derive the tunnel share and a readable railroad label.
transit_cost <- transit_cost %>%
  filter(!is.na(e)) %>%
  mutate(
    country = as.character(country),
    # if you don't convert to "char" above then due to factors it will return NA in country
    country_code = ifelse(country == "UK", "GB", country),
    country = countrycode(country_code, "iso2c", "country.name"),
    country = as.factor(country),
    tunnel_per = tunnel / length,
    rr = ifelse(rr, "Railroad", "Not Railroad")
  )
transit_cost
Plot code:
transit_cost %>%
filter(country == "India") %>%
mutate(city = fct_reorder(city, real_cost, sum)) %>%
ggplot(aes(x = real_cost, y = city, fill = year, group = as.factor(year))) +
geom_col() +
scale_x_continuous(label = scales::comma_format()) +
labs(title = "Total real cost of Projects across Indian cities",
subtitle = "color based on Year of Project Lines")
I also tried:
transit_cost %>%
filter(country == "India") %>%
mutate(city = fct_reorder(city, real_cost, sum)) %>%
# added this to order them
group_by(as.factor(year)) %>%
arrange(desc(year)) %>%
ggplot(aes(x = real_cost, y = city, fill = year, group = as.factor(year))) +
geom_col() +
scale_x_continuous(label = scales::comma_format()) +
labs(title = "Total real cost of Projects across Indian cities",
subtitle = "color based on years of Project Lines")
Try this:
transit_cost %>%
filter(country == "India") %>%
mutate(city = fct_reorder(city, real_cost, sum)) %>%
ggplot() +
geom_col(aes(x = real_cost, y = city, group = -year, fill = year)) +
scale_x_continuous(label = scales::comma_format()) +
labs(title = "Total real cost of Projects across Indian cities",
subtitle = "color based on Year of Project Lines")
The only changes I've made are replacing group = as.factor(year) with group = -year (fill = year stays numeric, without factor) and moving the aesthetics into geom_col().
I have a vector of parameters named theta and want to draw n random values for each theta.
S = 5
n = 5
test = tibble(
s = 1:S,
theta = rgamma(S, shape = 10*s, rate = 50)
) %>%
mutate(data = rexp(n, theta))
Ideally, I want my result to be something like this:
S theta data
1 some value [a list with n number]
...
then expand it into a tibble:
S theta d1 d2 d3 .. dn
...
Hope this is clear.
Thanks.
If we need a list, then we can use map to loop through each element of 'theta' and get the rexp output in a list
library(tidyverse)
# Draw `n` exponential variates for every rate in `theta`, one list element
# per row, and attach the draws to `test` as the list column `data`
draws_per_theta <- lapply(test$theta, function(rate) rexp(n, rate))
test1 <- mutate(test, data = draws_per_theta)
str(test1$data)
#List of 5
# $ : num [1:5] 5.88 7.94 1.64 3.3 11.25
# $ : num [1:5] 4.5942 0.5424 1.7479 0.0469 0.9573
# $ : num [1:5] 1.192 2.447 0.239 1.497 2.359
# $ : num [1:5] 1.2323 0.0996 1.5778 0.1278 0.6982
# $ : num [1:5] 0.15 0.733 0.19 3.548 2.08
The list column can be unnested
test1 %>%
unnest(data)
data
S <- 5
n <- 5
test <- tibble(
s = 1:S,
theta = rgamma(S, shape = 10*s, rate = 50)
)