R:splitting the dataset into groups for forecast - r

i have dataset and i have to perform daily forecast splited by groups.
The group is client+stuff
ts <- read.csv("C:/Users/Admin/Desktop/mydat.csv",sep=";", dec=",")
here mydat
structure(list(Data = structure(c(1L, 3L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L,
29L, 30L, 31L, 32L, 33L, 2L, 4L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 24L, 25L, 26L), .Label = c("01.04.2017",
"01.06.2017", "02.04.2017", "02.06.2017", "03.04.2017", "04.04.2017",
"05.04.2017", "06.04.2017", "07.04.2017", "08.04.2017", "09.04.2017",
"10.04.2017", "11.04.2017", "12.05.2017", "13.05.2017", "14.05.2017",
"15.05.2017", "16.05.2017", "17.05.2017", "18.05.2017", "19.05.2017",
"20.05.2017", "21.05.2017", "22.05.2017", "23.05.2017", "24.05.2017",
"25.05.2017", "26.05.2017", "27.05.2017", "28.05.2017", "29.05.2017",
"30.05.2017", "31.05.2017"), class = "factor"), client = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Horns and hooves", "Kornev & Co."
), class = "factor"), stuff = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("chickens", "hooves", "Oysters"), class = "factor"),
Продажи = c(374L, 12L, 120L, 242L, 227L, 268L, 280L, 419L,
12L, 172L, 336L, 117L, 108L, 150L, 90L, 117L, 116L, 146L,
120L, 211L, 213L, 67L, 146L, 118L, 152L, 122L, 201L, 497L,
522L, 65L, 268L, 441L, 247L, 348L, 445L, 477L, 62L, 226L,
476L, 306L)), .Names = c("Data", "client", "stuff", "Продажи"
), class = "data.frame", row.names = c(NA, -40L))
of course I can manually separate three datasets
horns and hooves + hooves
Horns and hooves + chickens
Kornev & Co. + oysters
but what to do in the case when I have a huge dataset and there are hundreds of groups. Do not manually split.
Is it possible to split it in R into groups and then perform a forecast?
the code for forecast is simple
The first i do so
library(forecast)
library(lubridate)
msts <- msts(ts$sales,seasonal.periods = c(7,365.25),start = decimal_date(as.Date("2017-05-12")))
plot(msts, main="sales", xlab="Year", ylab="sales")
tbats <- tbats(msts)
plot(tbats, main="Multiple Season Decomposition")
sp<- predict(tbats,h=14) #14 days forecast
plot(sp, main = "TBATS Forecast", include=14)
print(sp)
if the result does not suit me, I'm perform forecast via dummy variables
tsw <- ts(ts$Sales, start = decimal_date(as.Date("2017-05-12")), frequency = 7)
View(tsw)
mytslm <- tslm(tsw ~ trend + season)
print(mytslm)
residarima1 <- auto.arima(mytslm$residuals)
residualsArimaForecast <- forecast(residarima1, h=14)
residualsF <- as.numeric(residualsArimaForecast$mean)
regressionForecast <- forecast(mytslm,h=14)
regressionF <- as.numeric(regressionForecast$mean)
forecastR <- regressionF+residualsF
print(forecastR)

You can use split to split the data into groups by a combination of factors, in this case columns client and stuff.
group_list <- split(mydat, list(mydat$client, mydat$stuff))
group_list <- group_list[sapply(group_list, function(x) nrow(x) != 0)]
Then you can use this list and lapply any function you want. The following is how you would perform your first forecast. Note that I have separated the forecast code from the plots code and that each step of the forecast is done by one function, first apply function msts and produce a list of such objects, then apply function tbats and produce another list.
fun_msts <- function(ts){
msts(ts$Sales, seasonal.periods = c(7,365.25), start = decimal_date(as.Date("2017-05-12")))
}
fun_sp <- function(m){
tbats <- tbats(m)
predict(tbats, h=14) #14 days forecast
}
msts_list <- lapply(group_list, fun_msts)
sp_list <- lapply(msts_list, fun_sp)
Now if you want to, you can plot the results. In order to do that, define two other functions to be lapplyed.
plot_msts <- function(m, new.window = TRUE){
if(new.window) windows()
plot(m, main="Sales", xlab="Year", ylab="Sales")
}
plot_sp <- function(sp, new.window = TRUE){
if(new.window) windows()
plot(sp, main = "TBATS Forecast", include = 14)
}
lapply(msts_list, plot_msts)
lapply(sp_list, plot_sp)
In these functions a new graphic device is open with function windows. If you are not using Microsoft Windows or if you want to open another type of device, change that instruction but keep the if(new.window).
EDIT.
As for the regression with dummy variables, you can do the following.
fun_tslm <- function(x, start = "2017-05-12", freq = 7){
tsw <- ts(x[["Sales"]], start = decimal_date(as.Date(start)), frequency = freq)
#View(tsw)
mytslm <- tslm(tsw ~ trend + season)
mytslm
}
fun_forecast <- function(x, h = 14){
residarima1 <- auto.arima(x[["residuals"]])
residualsArimaForecast <- forecast(residarima1, h = h)
residualsF <- as.numeric(residualsArimaForecast$mean)
regressionForecast <- forecast(x, h = h)
regressionF <- as.numeric(regressionForecast$mean)
forecastR <- regressionF + residualsF
forecastR
}
tslm_list <- lapply(group_list, fun_tslm)
fore_list <- lapply(tslm_list, fun_forecast)

Related

Forecast: list of length 3 not meaningful when calculate MAPE in R

In this data
timeseries=structure(list(Data = structure(c(10L, 14L, 18L, 22L, 26L, 29L,
32L, 35L, 38L, 1L, 4L, 7L, 11L, 15L, 19L, 23L, 27L, 30L, 33L,
36L, 39L, 2L, 5L, 8L, 12L, 16L, 20L, 24L, 28L, 31L, 34L, 37L,
40L, 3L, 6L, 9L, 13L, 17L, 21L, 25L), .Label = c("01.01.2018",
"01.01.2019", "01.01.2020", "01.02.2018", "01.02.2019", "01.02.2020",
"01.03.2018", "01.03.2019", "01.03.2020", "01.04.2017", "01.04.2018",
"01.04.2019", "01.04.2020", "01.05.2017", "01.05.2018", "01.05.2019",
"01.05.2020", "01.06.2017", "01.06.2018", "01.06.2019", "01.06.2020",
"01.07.2017", "01.07.2018", "01.07.2019", "01.07.2020", "01.08.2017",
"01.08.2018", "01.08.2019", "01.09.2017", "01.09.2018", "01.09.2019",
"01.10.2017", "01.10.2018", "01.10.2019", "01.11.2017", "01.11.2018",
"01.11.2019", "01.12.2017", "01.12.2018", "01.12.2019"), class = "factor"),
client = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("Horns", "Kornev"), class = "factor"), stuff = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("chickens",
"hooves", "Oysters"), class = "factor"), Sales = c(374L,
12L, 120L, 242L, 227L, 268L, 280L, 419L, 12L, 172L, 336L,
117L, 108L, 150L, 90L, 117L, 116L, 146L, 120L, 211L, 213L,
67L, 146L, 118L, 152L, 122L, 201L, 497L, 522L, 65L, 268L,
441L, 247L, 348L, 445L, 477L, 62L, 226L, 476L, 306L)), .Names = c("Data",
"client", "stuff", "Sales"), class = "data.frame", row.names = c(NA,
-40L))
Create forecast by group
# first the grouping variable
timeseries$group <- paste0(timeseries$client,timeseries$stuff)
# determine all groups
groups <- unique(timeseries$group)
# find starting date per group and save them as a list of elements c('YEAR','Month')
timeseries$date <- as.Date(as.character(timeseries$Data), '%d.%m.%Y')
timeseries <- timeseries[order(timeseries$date),]
start_dates <- format(timeseries$date[match(groups, timeseries$group)], "%Y %m")
start_dates <- strsplit(start_dates, ' ')
# now the list
listed <- split(timeseries,timeseries$group)
str(listed)
# Edited the lapply funcion in order to consider the starting dates
# to have a smaller output, I post the str(listed)
library("forecast")
library("lubridate")
listed_ts <- lapply(seq_along(listed),
function(k) ts(listed[[k]][["Sales"]], start = as.integer(start_dates[[k]]), frequency = 12) )
listed_ts
listed_arima <- lapply(listed_ts,function(x) auto.arima(x,allowmean = F ))
#Now the forecast for each arima:
listed_forecast <- lapply(listed_arima,function(x) forecast(x,5) )
listed_forecast
do.call(rbind,listed_forecast)
lapply(listed_arima, fitted)
#As a side comment, note that the solution is equivalent to
lapply(listed_arima, function(x) fitted(x))
#For the same reason you may also use AIC Metrix
listed_arima <- lapply(listed_ts, auto.arima)
So I want calculate MAPE using library("MLmetrics")
Let's check help
?MAPE(y_pred, y_true)
y_true is timeseries data and y_pred is the result of lapply(listed_arima, fitted)
So I do so
MAPE(lapply(listed_arima, fitted), timeseries)
and get the error
Error in Ops.data.frame(y_true, y_pred) : list of length 3 not meaningful
What's wrong? Why I can't calculate MAPE metrics using this function from library("MLmetric)?
How for each group i can calculate MAPE?
So as output i want data frame like in my example
How to reach this output?
What you need is
mapply(MAPE, lapply(listed_arima, fitted), split(timeseries$Sales, timeseries$group))
# [1] 3.4659421 0.8926123 0.2577634
In this way we apply MAPE to each pair of elements of lists lapply(listed_arima, fitted) and split(timeseries$Sales, timeseries$group).
The issue with
MAPE(lapply(listed_arima, fitted), timeseries)
is that lapply(listed_arima, fitted) is a list while the first argument to MAPE has to be a vector, and also that timeseries is a data frame rather than just a single column.

indicate ts frequency by group in R

timeseries=structure(list(Data = structure(c(10L, 14L, 18L, 22L, 26L, 29L,
32L, 35L, 38L, 1L, 4L, 7L, 11L, 15L, 19L, 23L, 27L, 30L, 33L,
36L, 39L, 2L, 5L, 8L, 12L, 16L, 20L, 24L, 28L, 31L, 34L, 37L,
40L, 3L, 6L, 9L, 13L, 17L, 21L, 25L), .Label = c("01.01.2018",
"01.01.2019", "01.01.2020", "01.02.2018", "01.02.2019", "01.02.2020",
"01.03.2018", "01.03.2019", "01.03.2020", "01.04.2017", "01.04.2018",
"01.04.2019", "01.04.2020", "01.05.2017", "01.05.2018", "01.05.2019",
"01.05.2020", "01.06.2017", "01.06.2018", "01.06.2019", "01.06.2020",
"01.07.2017", "01.07.2018", "01.07.2019", "01.07.2020", "01.08.2017",
"01.08.2018", "01.08.2019", "01.09.2017", "01.09.2018", "01.09.2019",
"01.10.2017", "01.10.2018", "01.10.2019", "01.11.2017", "01.11.2018",
"01.11.2019", "01.12.2017", "01.12.2018", "01.12.2019"), class = "factor"),
client = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("Horns", "Kornev"), class = "factor"), stuff = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("chickens",
"hooves", "Oysters"), class = "factor"), Sales = c(374L,
12L, 120L, 242L, 227L, 268L, 280L, 419L, 12L, 172L, 336L,
117L, 108L, 150L, 90L, 117L, 116L, 146L, 120L, 211L, 213L,
67L, 146L, 118L, 152L, 122L, 201L, 497L, 522L, 65L, 268L,
441L, 247L, 348L, 445L, 477L, 62L, 226L, 476L, 306L)), .Names = c("Data",
"client", "stuff", "Sales"), class = "data.frame", row.names = c(NA,
-40L))
here three groups
Kornev Oysters 01.03.2018 - 01.06.2019
Horns hooves 01.07.2019 - 01.07.2020
Horns chickens 01.04.2017 - 01.02.2018
As we can see these groups have different time start.
in my code
create forecast by group
# first the grouping variable
timeseries$group <- paste0(timeseries$client,timeseries$stuff)
# now the list
listed <- split(timeseries,timeseries$group)
# to have a smaller output, I post the str(listed)
library("forecast")
library("lubridate")
listed_ts <- lapply(listed,
function(x) ts(x[["Sales"]], start = c(2017, 1), frequency = 12) )
listed_ts
listed_arima <- lapply(listed_ts,function(x) auto.arima(x) )
#Now the forecast for each arima:
listed_forecast <- lapply(listed_arima,function(x) forecast(x,5) )
listed_forecast
do.call(rbind,listed_forecast)
get initial values
lapply(listed_arima, fitted)
lapply(listed_arima, function(x) fitted(x))
listed_arima <- lapply(listed_ts, auto.arima)
The row where indicate start -
listed_ts <- lapply(listed,
function(x) ts(x[["Sales"]], start = c(2017, 1), frequency = 12) )
but some groups can start from 2018.start = c(2018, 3)
How to do that for each group individually indicated the starting time!?
I can't indicate start time for each group separately manually, cause can be more than 100 groups
This might be of help
# determine all groups
groups <- unique(timeseries$group)
# find starting date per group and save them as a list of elements c('YEAR','Month')
timeseries$date <- as.Date(as.character(timeseries$Data), '%d.%m.%Y')
timeseries <- timeseries[order(timeseries$date),]
start_dates <- format(timeseries$date[match(groups, timeseries$group)], "%Y %m")
start_dates <- strsplit(start_dates, ' ')
# Back to your code
# now the list
listed <- split(timeseries,timeseries$group)
# Edited the lapply funcion in order to consider the starting dates
# to have a smaller output, I post the str(listed)
listed_ts <- lapply(seq_along(listed),
function(k) ts(listed[[k]][["Sales"]], start = as.integer(start_dates[[k]]), frequency = 12) )
Explanation
Figure out for each group what the starting date is and save it as a vector c('Year','Month')
All those starting dates are saved in a list start_dates
modify the lapply function in order to consider the starting dates

In R, how do I order within a single column so one category is ascending and one is descending?

I am generating multiple experimental designs of different sizes and shapes. This is done using a function dependent on the agricolae package (I’ve included it below). To generate practical data sheets for field operations I need to order the data frame by Row, then for odd Rows sort the Range ascending and for even Rows sort it descending.
Using sort, order, rep and seq I have been able to find a simple solution to this. Any suggestions are greatly appreciated!
So the data frame will go from something like this:
df1 <- structure(list(Block = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), Range = c(1L, 2L, 3L, 4L, 1L, 2L,
3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L,
3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L,
3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), Row = c(1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L,
9L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L
), Plot = c(101L, 201L, 301L, 401L, 102L, 202L, 302L, 402L, 103L,
203L, 303L, 403L, 104L, 204L, 304L, 404L, 105L, 205L, 305L, 405L,
106L, 206L, 306L, 406L, 107L, 207L, 307L, 407L, 108L, 208L, 308L,
408L, 109L, 209L, 309L, 409L, 110L, 210L, 310L, 410L, 111L, 211L,
311L, 411L, 112L, 212L, 312L, 412L), Entry.Num = c(14L, 26L,
18L, 4L, 52L, 17L, 41L, 47L, 40L, 30L, 21L, 12L, 9L, 2L, 8L,
36L, 25L, 43L, 15L, 6L, 33L, 48L, 54L, 37L, 9L, 18L, 8L, 41L,
48L, 28L, 7L, 47L, 54L, 38L, 46L, 23L, 19L, 1L, 3L, 27L, 36L,
14L, 12L, 33L, 16L, 24L, 31L, 2L)), .Names = c("Block", "Range",
"Row", "Plot", "Entry.Num"), class = "data.frame", row.names = c(NA,
-48L))
To something like this:
df2 <- structure(list(Block = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), Range = c(1L, 2L, 3L, 4L, 4L, 3L,
2L, 1L, 1L, 2L, 3L, 4L, 4L, 3L, 2L, 1L, 1L, 2L, 3L, 4L, 4L, 3L,
2L, 1L, 1L, 2L, 3L, 4L, 4L, 3L, 2L, 1L, 1L, 2L, 3L, 4L, 4L, 3L,
2L, 1L, 1L, 2L, 3L, 4L, 4L, 3L, 2L, 1L), Row = c(1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L,
9L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L
), Plot = c(101L, 201L, 301L, 401L, 402L, 302L, 202L, 102L, 103L,
203L, 303L, 403L, 404L, 304L, 204L, 104L, 105L, 205L, 305L, 405L,
406L, 306L, 206L, 106L, 107L, 207L, 307L, 407L, 408L, 308L, 208L,
108L, 109L, 209L, 309L, 409L, 410L, 310L, 210L, 110L, 111L, 211L,
311L, 411L, 412L, 312L, 212L, 112L), Entry.Num = c(14L, 26L,
18L, 4L, 47L, 41L, 17L, 52L, 40L, 30L, 21L, 12L, 36L, 8L, 2L,
9L, 25L, 43L, 15L, 6L, 37L, 54L, 48L, 33L, 9L, 18L, 8L, 41L,
47L, 7L, 28L, 48L, 54L, 38L, 46L, 23L, 27L, 3L, 1L, 19L, 36L,
14L, 12L, 33L, 2L, 31L, 24L, 16L)), .Names = c("Block", "Range",
"Row", "Plot", "Entry.Num"), class = "data.frame", row.names = c(NA,
-48L))
In case you're interested, this is the trial design function. There is undoubtedly a more elegant way to do this but I am not particularly good at R:
Trial.Design <- function(Total.Entries, Rows.per.Block, Ranges.per.Block, Trial.Name){
library(agricolae)
library(reshape2)
#########################################################################################
# Generate a trial design #
#########################################################################################
total.trt <- Total.Entries
if(total.trt%%2) # If the variety number is uneven it will return the following error message
stop("WARNING: Variety number is uneven! Subsequent script will not work correctly!")
blocks <- 4 # This is fixed, we are unlikely to use a different block number in any trial.
trt<-c(1:total.trt) # You could in theory have the variety names here.
# This function from agricolae generates a statistically sound trial design.
outdesign <-design.rcbd(trt, blocks, serie=0,continue=TRUE,986,"Wichmann-Hill") # seed for ranomization = 986
# This uses an agricolae function to print the "field book" of the trial.
book <-outdesign$book # field book
#########################################################################################
# Generate blocking in two directions #
#########################################################################################
# The following generates an appropriately blocked map. The idea is block in two directions.
# We use this design so that the blocking structure captures field trends both down and across the field.
Block.Rows <- Rows.per.Block
Block.Ranges <- Ranges.per.Block
ifelse(total.trt==Block.Rows*Block.Ranges, "Entry number is okay",
stop("WARNING: Block is uneven and/or does not equal entry number! Subsequent script will not work correctly!"))
Block <- matrix(rep(1, times=total.trt))
Range <- matrix(rep(1:Block.Rows, times=Block.Ranges))
Row <- matrix(rep(1:Block.Ranges, each=Block.Rows))
Block.1 <- cbind(Block, Range)
Block.1 <- cbind(Block.1, Row)
Block <- matrix(rep(3, times=total.trt))
Range <- matrix(rep((Block.Rows+1):(Block.Rows*2), times=Block.Ranges))
Row <- matrix(rep(1:Block.Ranges, each=Block.Rows))
Block.3 <- cbind(Block, Range)
Block.3 <- cbind(Block.3, Row)
Block <- matrix(rep(2, times=total.trt))
Range <- matrix(rep(1:Block.Rows, times=Block.Ranges))
Row <- matrix(rep((Block.Ranges+1):(Block.Ranges*2), each=Block.Rows))
Block.2 <- cbind(Block, Range)
Block.2 <- cbind(Block.2, Row)
Block <- matrix(rep(4, times=total.trt))
Range <- matrix(rep((Block.Rows+1):(Block.Rows*2), times=Block.Ranges))
Row <- matrix(rep((Block.Ranges+1):(Block.Ranges*2), each=Block.Rows))
Block.4 <- cbind(Block, Range)
Block.4 <- cbind(Block.4, Row)
# The following adds the coordinates generated above to our field book.
Field.book <- rbind(Block.1, Block.2)
Field.book <- rbind(Field.book, Block.3)
Field.book <- rbind(Field.book, Block.4)
Plots <- as.matrix(rep(1:(total.trt*4)))
Field.book <- cbind(Plots, Field.book)
# Generate temporary Range names.
colnames(Field.book) <- c("plots", "block", "range", "row")
Field.book <- as.data.frame(Field.book)
Field.book$range <- as.numeric(Field.book$range)
Field.book$row <- as.numeric(Field.book$row)
# This joins the experimental design generated by agricolae to the plot layout generated above.
Field.book <- join(Field.book, book, by= c("plots","block"))
# Generate better Range names.
colnames(Field.book) <- c("Plot.Num", "Block", "Range", "Row", "Entry.Num")
# Create Plot coordinates.
Field.book$Plot <- (Field.book$Range * 100) + Field.book$Row
# Reorders the Ranges to something more intuitive.
# I drop the 'plot number' Range generated by agricolae because I don't think it is useful or necessary in our case.
Field.book <- Field.book[c("Block", "Range", "Row", "Plot", "Entry.Num")]
# Sort the plots by Range and Row.
Field.book <- Field.book[order(Field.book$Range, Field.book$Row),]
Field.book <<- Field.book
# Convert the Ranges to factors to allow for conversion to a 'wide' format.
Field.book$Block <- as.factor(Field.book$Block)
Field.book$Range <- as.factor(Field.book$Range)
Field.book$Row <- as.factor(Field.book$Row)
Field.book$Plot <- as.factor(Field.book$Plot)
#########################################################################################
# Generate plot maps #
#########################################################################################
# This function rotates the design if it's deemed necessary.
# rotate <- function(x) t(apply(x, 2, rev))
Field.design.num <- dcast(Field.book, Row ~ Range, value.var = "Entry.Num")
Field.design.num$Row <- as.numeric(Field.design.num$Row)
Field.design.num <- Field.design.num[order(-Field.design.num$Row),]
Field.book$Plot <- as.factor(Field.book$Plot)
colnames(Field.design.num)[2:ncol(Field.design.num)] <- paste("Row", colnames(Field.design.num[,c(2:ncol(Field.design.num))]), sep = "-")
Field.design.num$Row <- sub("^", "Range-", Field.design.num$Row)
#rotate(Field.design.num)
Field.design.num <<- Field.design.num
Field.design.plot <- dcast(Field.book, Row ~ Range, value.var = "Plot")
Field.design.plot$Row <- as.numeric(Field.design.plot$Row)
Field.design.plot <- Field.design.plot[order(-Field.design.plot$Row),]
Field.book$Plot <- as.factor(Field.book$Plot)
colnames(Field.design.plot)[2:ncol(Field.design.plot)] <- paste("Row", colnames(Field.design.plot[,c(2:ncol(Field.design.plot))]), sep = "-")
Field.design.plot$Row <- sub("^", "Range-", Field.design.plot$Row)
#rotate(Field.design.plot)
Field.design.plot <<- Field.design.plot
Field.design.Block <- dcast(Field.book, Row ~ Range, value.var = "Block")
Field.design.Block$Row <- as.numeric(Field.design.Block$Row)
Field.design.Block <- Field.design.Block[order(-Field.design.Block$Row),]
Field.book$Block <- as.factor(Field.book$Block)
colnames(Field.design.Block)[2:ncol(Field.design.Block)] <- paste("Row", colnames(Field.design.Block[,c(2:ncol(Field.design.Block))]), sep = "-")
Field.design.Block$Row <- sub("^", "Range-", Field.design.Block$Row)
#rotate(Field.design.Block)
Field.design.Block <<- Field.design.Block
#########################################################################################
# Write the files #
#########################################################################################
write.csv(Field.book, paste("Field Book",Trial.Name,".csv"), row.names=FALSE)
write.csv(Field.design.num, paste("Field map Entry",Trial.Name,".csv"), row.names=FALSE)
write.csv(Field.design.plot, paste("Field map Plots",Trial.Name,".csv"), row.names=FALSE)
write.csv(Field.design.Block, paste("Field map Blocks",Trial.Name,".csv"), row.names=FALSE)
#########################################################################################
}
# The parameters are:
# The total number of entires/varieties in a replicate (NOTE: The number of entries must be an even number).
# The number of rows in an individual block/replicate.
# The number of ranges in an individual block/replicate.
# (NOTE: The number of rows and ranges must multiply to give the number of entries.)
# The trial name is what will be written to your working directory.
Total.Entries = 54
Rows.per.Block = 9
Ranges.per.Block = 6
Trial.Name = "Example"
Trial.Design (Total.Entries, Rows.per.Block, Ranges.per.Block, Trial.Name)
The magic of order awaits you:
df1[order(df1$Row, c(-1,1)[df1$Row %% 2 + 1] * df1$Range ),]
Essentially what this does is order by Row, then by Range, multiplied by -1 if it is even. x %% 2 can be used to check for odd/even status.
all.equal(
df1[order(df1$Row, c(-1,1)[df1$Row %% 2 + 1] * df1$Range ),],
df2,
check.attributes=FALSE
)
#[1] TRUE

qplot creates a blank plot but with correct axes and legend

I am trying to create a barchart for the number of passes per date for 3 sites. Here is my initial working:
Test <- read.csv("Test.csv")
head(Test)
str(Test)
> head(Test)
Date Site Passes
1 20111223 Met Mast 10m 8
2 20111224 Met Mast 10m 4
3 20111225 Met Mast 10m 27
4 20111226 Met Mast 10m 4
5 20111227 Met Mast 10m 20
6 20111228 Met Mast 10m 11
> str(Test)
'data.frame': 26 obs. of 3 variables:
$ Date : int 20111223 20111224 20111225 20111226 20111227 20111228 20111229
20111230 20111231 20111223 ...
$ Site : Factor w/ 3 levels "Control","Met Mast 10m",..: 2 2 2 2 2 2 2 2 2 3 ...
$ Passes: int 8 4 27 4 20 11 17 0 41 6 ...
I am using ggplot so I load the library and use qplot:
library(ggplot2)
qplot(Date, data=Test, geom ="bar", position="dodge", fill=Site, binwidth=0.5,
weight = Passes) +
scale_y_continuous("Number of Passes")
I get a perfect graph (sorry I cant post an image) thats exactly what I want with date on the x-axis, number of passes on the y-axis, and the 3 factors plotted side-by-side next to each other for each date with a legend.
I tested my method and code on a subset of my data (i.e. only 26 observations) to make sure it works which it did. Now I want to plot the same graph for all of my data which consists of 97 observations.
The issue:
When I run the code for all my data the output is a blank graph with correctly labeled axes and legend. On the x-axis the date format changed from the usuall 20111223 to 20112500 which is non-sensical and I dont know why.
Ive read many of the ggplot2 and qplot questions on this forum but I have not found anyone who has had a similar issue. Because I got the plot to work with few data points I think it may be an issue of the x-axis scale but Im not sure. Why is the plot blank?
Please help :)
> dput(Test)
structure(list(Date = c(20111223L, 20111224L, 20111225L, 20111226L,
20111227L, 20111228L, 20111229L, 20111230L, 20111231L, 20120101L,
20120102L, 20120103L, 20120104L, 20120105L, 20120106L, 20120107L,
20120108L, 20120109L, 20120110L, 20120111L, 20120112L, 20120113L,
20120114L, 20120115L, 20120116L, 20120117L, 20120118L, 20120119L,
20120120L, 20120121L, 20120122L, 20120123L, 20120124L, 20120125L,
20120126L, 20120127L, 20111223L, 20111224L, 20111225L, 20111226L,
20111227L, 20111228L, 20111229L, 20111230L, 20111231L, 20120101L,
20120102L, 20120103L, 20120104L, 20120105L, 20120106L, 20120107L,
20120108L, 20120109L, 20120110L, 20120111L, 20120112L, 20120113L,
20120114L, 20120115L, 20120116L, 20120117L, 20120118L, 20120119L,
20120120L, 20120121L, 20120122L, 20120123L, 20120124L, 20120125L,
20120126L, 20120127L, 20111222L, 20111223L, 20111224L, 20111225L,
20111226L, 20111227L, 20111228L, 20111229L, 20111230L, 20111231L,
20120101L, 20120102L, 20120103L, 20120104L, 20120105L, 20120106L,
20120107L, 20120108L, 20120109L, 20120110L, 20120111L, 20120112L,
20120113L, 20120114L, 20120115L), Site = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Control",
"Met Mast 10m", "Met Mast 17m"), class = "factor"), Passes = c(8L,
4L, 27L, 4L, 20L, 11L, 17L, 0L, 41L, 6L, 9L, 29L, 38L, 5L, 38L,
12L, 6L, 14L, 18L, 35L, 36L, 8L, 23L, 25L, 0L, 0L, 0L, 37L, 53L,
59L, 205L, 15L, 2L, 14L, 6L, 6L, 6L, 6L, 29L, 4L, 26L, 12L, 12L,
3L, 21L, 20L, 17L, 31L, 79L, 10L, 59L, 12L, 6L, 18L, 21L, 27L,
31L, 14L, 40L, 37L, 0L, 0L, 0L, 52L, 52L, 63L, 0L, 30L, 2L, 14L,
14L, 4L, 41L, 296L, 115L, 470L, 171L, 589L, 96L, 306L, 44L, 131L,
244L, 75L, 217L, 252L, 304L, 313L, 250L, 315L, 121L, 298L, 247L,
222L, 161L, 335L, 173L)), .Names = c("Date", "Site", "Passes"
), class = "data.frame", row.names = c(NA, -97L))
If it works for a subset of your data, it is most likely that there is somehting unexpected in your data (NA's, etc). It's not possible to know exactly what is going on without seeing the data. If you provide the data with dput(data), you may get a better answer.

Analysing entire dataframes instead of separate subsets in r [closed]

This question is unlikely to help any future visitors; it is only relevant to a small geographic area, a specific moment in time, or an extraordinarily narrow situation that is not generally applicable to the worldwide audience of the internet. For help making this question more broadly applicable, visit the help center.
Closed 9 years ago.
I have code that analyses Gforces in movement, that returns to me a plot of GForces over time. The script works for subsets of the data (categorized by ID) and I don't have a clue how to make it compatible for analyzing the entire dataframe at once, returning to me the results in a faceted graph (with ggplot for instance). Does anyone know how to do this or am i doomed to analyse my data by one ID at a time?
The code is:
#Subset by ID
number1 <-subset(positions,subset=(ID==1))
head(number1)
A <- numeric()
Al <- numeric()
Radius <- numeric()
GForce <- numeric()
D12 <- numeric()
D13 <- numeric()
D23 <- numeric()
Proportion <- numeric()
Proportion_sel <- numeric()
nr<-length(number1$Timestamp)
for(i in 3:nr){
D12[i] <- sqrt((positions$X[i-2]-positions$X[i-1])^2 + (positions$Y[i-2]-positions$Y[i-1])^2)
D23[i] <- sqrt((positions$X[i-1]-positions$X[i])^2 + (positions$Y[i-1]-positions$Y[i])^2)
D13[i] <- sqrt((positions$X[i-2]-positions$X[i])^2 + (positions$Y[i-2]-positions$Y[i])^2)
if (D13[i]>0) {
if ((((D12[i]^2+D13[i]^2-D23[i]^2)^2)/(4*(D13[i]^2))) > D12[i]^2) {
A[i] <- 0
Radius[i] <- 0
GForce[i] <- 0
}
else {
A[i] <- sqrt(D12[i]^2-(((D12[i]^2+D13[i]^2-D23[i]^2)^2)/(4*(D13[i]^2))))
Radius[i] <- ((0.5*D13[i])^2+(A[i])^2)/(2*A[i])
GForce[i] <- ((D12[i]+D23[i])/2)^2/Radius[i]
}
}
else {
A[i] <- 0
Radius[i] <- 0
GForce[i] <- 0
}
}
# GForce plot over Time
plot(number1$Timestamp,GForce)
This returns to me:
- a plot of Gforces over time
The problem seems to arise with finding a way to return the GForce per ID and return them as an additional column to the original dataframe. When this would be possible, it would also be easy to plot per ID using ggplot but I don't know how to accomplish this..
Data (simplified):
dput(positions)
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), Timestamp = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L,
25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L,
38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L,
33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L
), X = c(-34.126, -34.087, -34.047, -34.01, -33.983, -33.973,
-33.985, -34.019, -34.07, -34.13, -34.188, -34.237, -34.273,
-34.295, -34.305, -34.307, -34.305, -34.301, -34.299, -34.297,
-34.296, -34.29, -34.272, -34.235, -34.178, -34.107, -34.034,
-33.967, -33.91, -33.865, -33.832, -33.81, -33.799, -33.8, -33.814,
-33.841, -33.878, -33.923, -33.975, -34.033, -34.098, -34.17,
-34.243, -34.311, -34.366, -33.691, -33.646, -33.598, -33.547,
-33.497, -33.452, -33.414, -33.383, -33.357, -33.331, -33.302,
-33.268, -33.234, -33.203, -33.179, -33.163, -33.154, -33.145,
-33.132, -33.111, -33.081, -33.041, -32.993, -32.937, -32.873,
-32.807, -32.74, -32.676, -32.611, -32.543, -32.468, -32.384,
-32.293, -32.199, -32.109, -32.029, -31.959, -31.899, -31.846,
-31.796, -31.749, -31.704, -31.659, -31.612, -31.561), Y = c(3.393,
3.396, 3.398, 3.402, 3.408, 3.419, 3.434, 3.452, 3.471, 3.489,
3.506, 3.518, 3.525, 3.526, 3.523, 3.518, 3.513, 3.511, 3.511,
3.513, 3.516, 3.517, 3.51, 3.494, 3.467, 3.434, 3.402, 3.376,
3.358, 3.348, 3.343, 3.343, 3.346, 3.351, 3.36, 3.373, 3.39,
3.408, 3.43, 3.453, 3.479, 3.506, 3.532, 3.555, 3.57, 6.684,
6.757, 6.823, 6.887, 6.953, 7.024, 7.099, 7.174, 7.245, 7.307,
7.363, 7.414, 7.466, 7.52, 7.579, 7.643, 7.71, 7.781, 7.853,
7.923, 7.988, 8.047, 8.098, 8.139, 8.173, 8.202, 8.228, 8.251,
8.27, 8.285, 8.298, 8.314, 8.337, 8.372, 8.423, 8.487, 8.558,
8.628, 8.688, 8.735, 8.768, 8.792, 8.814, 8.843, 8.881)), .Names = c("ID",
"Timestamp", "X", "Y"), row.names = c(NA, 90L), class = "data.frame")
I join #Arun, You need to explain what are you doing. The code is not really helpful, specially if is not well written and have some errors.
That's said , if I look in you code , you do a procees by ID. One idea is to put your code in a function and call it for each ID using lapply.
You create 3 plots by ID, So you can create a matrix layout , where you plot the 3 plots in each row.
ids <- unique(positions$ID)
layout(matrix(1:(length(ids)*3),ncol=3,byrow=TRUE))
Then you call your process for each ID, for example:
lapply(ids,function(myID){
number1 <-subset(positions,subset=(ID==myID))
.....
# GForce calculation
plot(number1$Timestamp,GForce) ## I change one line
...
sum(GForce[3:max])
})

Resources