Data
Here is the dput of my data:
heart <- structure(list(died = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), levels = c("Survive",
"Died"), class = c("labelled", "factor"), label = "Death", format = "%8.0g", value.label.table = structure(0:1, names = c("Survive",
"Died"))), procedure = structure(c(2L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L), levels = c("PTCA",
"CABG"), class = c("labelled", "factor"), label = "1=CABG; 0=PTCA", format = "%8.0g", value.label.table = structure(0:1, names = c("PTCA",
"CABG"))), age = structure(c(65L, 69L, 76L, 65L, 69L, 67L, 69L,
66L, 74L, 67L, 76L, 76L, 68L, 77L, 77L, 70L, 68L, 69L, 68L, 70L,
71L, 75L, 69L, 72L, 66L, 77L, 68L, 78L, 77L, 69L, 65L, 70L, 65L,
70L), label = "PATIENT AGE", class = c("labelled", "integer"), format = "%8.0g"),
gender = structure(c(2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L), levels = c("Female",
"Male"), class = c("labelled", "factor"), label = "Gender", format = "%8.0g", value.label.table = structure(0:1, names = c("Female",
"Male"))), los = structure(c(10L, 7L, 7L, 8L, 1L, 7L, 2L,
9L, 3L, 1L, 6L, 14L, 4L, 13L, 10L, 4L, 2L, 2L, 10L, 2L, 6L,
12L, 4L, 2L, 2L, 14L, 3L, 2L, 5L, 9L, 3L, 3L, 3L, 2L), label = "LOS", class = c("labelled",
"integer"), format = "%8.0g"), type = structure(c(1L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L,
2L, 1L), levels = c("Elective", "Emer/Urg"), class = c("labelled",
"factor"), label = "Severity", format = "%8.0g", value.label.table = structure(0:1, names = c("Elective",
"Emer/Urg")))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-34L), stata.info = list(datalabel = "AZ 1991; CABG(106,107) & PTCA(112)",
version = 12L, time.stamp = "17 Feb 2015 12:45", val.labels = c("diedlb",
"pxllb", "", "sexllb", "", "typellb"), label.table = list(
pxllb = structure(0:1, names = c("PTCA", "CABG")), typellb = structure(0:1, names = c("Elective",
"Emer/Urg")), sexllb = structure(0:1, names = c("Female",
"Male")), diedlb = structure(0:1, names = c("Survive",
"Died")))))
Problem
So far I have fit multiple models like this using tensor splines to encode main and interaction effects:
#### Load Libraries ####
library(oddsratio)
library(mgcv)
library(splines)
library(tidyverse)
#### Fit GAM With Tensor Splines ####
fit <- gam(factor(died)
~ ti(age)
+ ti(los)
+ ti(age,los),
data = heart,
family = binomial)
However, when I try to plot the main effects:
plot_gam(model = fit,
pred = "age")
I get this odd warning:
Error in plot_df[[set_pred]]$x : $ operator is invalid for atomic vectors
I tried calling the function plot_gam explicitly, which look like this, but I'm a bit fuzzy on why this code is causing the issue. Calling whatever gam_to_df is also didn't seem to clarify the problem:
function (model = NULL, pred = NULL, col_line = "blue", ci_line_col = "black",
ci_line_type = "dashed", ci_fill = "grey", ci_alpha = 0.4,
ci_line_size = 0.8, sm_fun_size = 1.1, title = NULL, xlab = NULL,
ylab = NULL, limits_y = NULL, breaks_y = NULL)
{
df <- gam_to_df(model, pred)
if (is.null(xlab)) {
xlab <- df[[pred]]$xlab
}
if (is.null(ylab)) {
ylab <- df[[pred]]$ylab
}
plot_gam <- ggplot(df, aes_(~x, ~y)) + geom_line(colour = col_line,
size = sm_fun_size) + geom_line(aes_(~x, ~se_upr), linetype = ci_line_type,
colour = ci_line_col, size = ci_line_size) + geom_line(aes_(~x,
~se_lwr), linetype = ci_line_type, colour = ci_line_col,
size = ci_line_size) + geom_ribbon(aes_(x = ~x, ymin = ~se_lwr,
ymax = ~se_upr), fill = ci_fill, alpha = ci_alpha) +
ylab(ylab) + xlab(xlab)
if (!is.null(limits_y) & !is.null(breaks_y)) {
plot_gam <- plot_gam + scale_y_continuous(breaks = c(breaks_y),
limits = c(limits_y))
}
else if (!is.null(limits_y) & is.null(breaks_y)) {
plot_gam <- plot_gam + scale_y_continuous(limits = limits_y)
}
else if (is.null(limits_y) & !is.null(breaks_y)) {
plot_gam <- plot_gam + scale_y_continuous(breaks = c(breaks_y))
}
if (!is.null(title)) {
plot_gam <- plot_gam + ggtitle(title)
}
return(plot_gam)
}
I would greatly appreciate a resolution to this issue. The plot_gam function is super useful for what I'm trying to achieve.
I was inspecting the code. Error is caused by a fairly naïve implementation of oddsratio::gam_to_df function.
gam_to_df <- function (model = NULL, pred = NULL)
{
plot_df <- no_plot(model)
set_pred <- grep(paste0("\\b", pred, "\\b"), plot_df)
df <- data.frame(x = plot_df[[set_pred]]$x, se_upr = plot_df[[set_pred]]$fit +
plot_df[[set_pred]]$se, se_lwr = plot_df[[set_pred]]$fit -
plot_df[[set_pred]]$se, y = plot_df[[set_pred]]$fit)
return(df)
}
no_plot <- function (model = NULL)
{
png("temp.xyz")
plot_df <- plot(model, pages = 1)
dev.off()
file.remove("temp.xyz")
return(invisible(plot_df))
}
The function first create a plot and return it (it is a list) then extract the element that contains "age" somewhere in one of the elements, searching it via grep. Since two of three elements contains "age" it returns a vector of two elelments c(1,3). Thus plot_df[[set_pred]] instead of selecting the first element, selects the first of the list and then the third element inside of the list, which is a vector.
Hence the error "operator is invalid for atomic vectors"
Appears to be a bug.
EDIT
Filled an issue at https://github.com/pat-s/oddsratio/issues/54
Related
I am trying to make a PCA plot using ggplot and geom_point.
I would like to illustrate 3 factors (Diet, Time, Antibiotics).
I thought I could outline the points in black for one factor).
However this isn't showing the third factor (Time) for the Fill color.
Here is a subset of my data:
> dput(dat.pcx.annot.test)
structure(list(PC1 = c(25.296379160162, 1.4703101394886, 11.4138097811008,
1.41798772574591, 23.7253675969881, 15.5683516005535, -34.6012195481675,
-25.7129281491955, -2.97230018393742, 4.83421092719293, -0.0274189140249825,
23.227939504077, 15.2002258785889, -35.2243685702227, -34.2537374460037,
-7.6380794043063), PC2 = c(27.2678813936857, -9.88577494210313,
-6.19394322321806, -8.88953660465497, 33.6791127012231, -13.2912233546802,
7.77877968081575, 2.7371646557436, -8.41929538502921, -11.5151849519265,
-9.40733576034963, 32.3549860618533, -11.2170071727855, 10.0455709347794,
3.05679707335492, -6.66218028060621), Diet = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L), .Label = c("RC",
"WD"), class = "factor"), Time = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("ZT14",
"ZT2"), class = "factor"), Antibiotics = structure(c(2L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L), .Label = c("Antibiotics ",
"None"), class = "factor")), row.names = c(1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 18L, 19L, 20L, 21L, 22L), class = "data.frame")
Here is the plotting command :
ggplot(dat.pcx.annot.test,aes(x=PC1,y=PC2,color=Diet,shape=Antibiotics,Fill=Time))+
geom_point(size=3,alpha=0.5)+
scale_color_manual(values = c("black","white") )
And the plot it produces:
I thought if I had both color and fill specified then they would both show.
I would like black outlines for Antibiotics, and Fill color for Time.
Right now Time is not represented.
Any help on how to simultaneously view the 3 factors.
Thanks
Yes I had a fill typo. And I finally figured out how to get the legends to correspond. Here is my final answer.
ggplot(dat.pcx.annot,aes(x=PC1,y=PC2,color=Diet,shape=Antibiotics,fill=Time))+
geom_point(size=3)+
scale_shape_manual(values = c(21, 22) )+
scale_color_manual(values = c("black","white") )+
scale_fill_manual(values=c("#EC9DAE","#AEDE94"))+
xlab(PC1var)+
ylab(PC2var)+
guides(fill=guide_legend(override.aes=list(shape=21)))+
guides(color=guide_legend(override.aes=list(shape=21)))
guides(fill=guide_legend(override.aes=list(shape=21,fill=c("#EC9DAE","#AEDE94"),color=c("black","white"))))
ggsave("cohort2_pca.pdf")
Say, here the mydata (little part)
transport<- structure(list(date = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L), .Label = c("01.01.2001", "01.02.2001", "01.03.2001",
"01.04.2001", "01.05.2001", "01.06.2001", "01.07.2001", "01.08.2001",
"01.09.2001", "01.10.2001", "01.11.2001", "01.12.2001"), class = "factor"),
Market_82 = c(7000L, 7272L, 7668L, 7869L, 8057L, 8428L, 8587L,
8823L, 8922L, 9178L, 9306L, 9439L, 3725L, 4883L, 8186L, 7525L,
6335L, 4252L, 5642L, 1326L, 8605L, 3501L, 1944L, 7332L),
transport = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("plane", "train"), class = "factor")), .Names = c("date",
"Market_82", "transport"), class = "data.frame", row.names = c(NA,
-24L))
group variable - Transport.
For each type of transport i must get acf plot of time series.
something like this
How perform acf plot for each transport?
I have a lot of groups. How to do that plots were in folder
C:/Users/admin/Documents/myplot
akrun's answer is spot on. Since you tagged the question with ggplot2 you could also use ggAcf from the forcast package.
The first step is to split your data.
transport_split <- split(transport, transport$transport)
If you want to include the respective element of column transport in the title, subtitle etc. try with Map
out <- Map(
f = function(x, y)
forecast::ggAcf(x$Market_82) + labs(title = y),
x = transport_split,
y = names(transport_split)
)
out$train
We can do this with Acf from forecast
library(forecast)
par(mfrow = c(2, 1))
lapply(split(transport['Market_82'], transport$transport), Acf)
If we also want the title, then
lst <- lapply(split(transport['Market_82'], transport$transport), acf, plot = FALSE)
par(mfrow = c(2, 1))
lapply(names(lst), function(x) plot(lst[[x]], main = x))
I am trying to create a plot in ggplot showing the mean home range size of an animal according to different sexes, treatments, time periods and seasons. I get an error in R saying
Error: Aesthetics must be either length 1 or the same as the data (24): x, y, colour, shape"
I have read similar posts about this error but I haven't been able to figure it out yet. There are no NA's in these columns and my numerical variables are being treated as such. Not sure if the error has to do with a need to sub set the data but I don't understand how I should do that. My code runs fine up until the ggplot part and it is the following:
library("ggplot2")
library("dplyr")
lion_HR_size <- read.csv(file = "https://dl.dropboxusercontent.com/u/23723553/lion_sample_data.csv",
header= TRUE, row.names=1)
# Mean of home range size by season, treatment, sex and time
Mean_HR <- lion_HR_size %>%
group_by(season, treatment, sex, time) %>%
summarize(
mean_HR = mean(Area_HR_km),
se_HR = sd(Area_HR_km)/sqrt(n()),
lwrHR = mean_HR - se_HR,
uprHR = mean_HR + se_HR)
limitsHR <- aes(ymin = lwrHR, ymax= uprHR)
ggplot(Mean_HR,
aes(x=season,
y= Mean_HR,
colour=season,
shape= season)) +
geom_point( size = 6, alpha = 0.5)+
facet_grid(sex ~ treatment+time)+
geom_errorbar(limitsHR, width = 0.1, col = 'red', alpha = 0.8)+
theme_bw()
As requested, the dput(Mean_HR) output is the following:
dput(Mean_HR)
structure(list(season = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L), .Label = c("Early_dry", "Late_dry", "Wet"), class = "factor"),
treatment = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L
), .Label = c("C", "E"), class = "factor"), sex = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("F", "M"), class = "factor"),
time = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("A",
"B"), class = "factor"), mean_HR = c(141.594090181, 138.327188493,
509.287443507692, 345.296845642381, 157.634028930833, 184.202160663125,
252.464096340667, 255.078012825, 59.8485325981818, 143.158189516522,
439.990400912593, 175.410885601333, 221.338774452381, 100.942251723636,
127.961533612727, 167.199563142143, 120.60363022375, 142.351764574211,
249.03854219, 330.018734301176, 123.992902995714, 219.886321226667,
307.869373359167, 296.019550844286), se_HR = c(18.6245437612391,
29.2548378154774, 127.987824704623, 78.9236194797204, 20.8897993194466,
43.1314245224751, 57.6327505533691, 32.1129054260719, 9.383853530199,
38.7678333459788, 130.348285186224, 31.707304307485, 29.1561478797825,
15.4038723326613, 18.1932127432015, 37.791782522185, 32.7089231722616,
33.2629181623941, 46.1500408067739, 88.8736578370159, 15.8046627788777,
36.9665360444972, 70.1560303348504, 87.1340476758794), lwrHR = c(122.969546419761,
109.072350677523, 381.29961880307, 266.373226162661, 136.744229611387,
141.07073614065, 194.831345787298, 222.965107398928, 50.4646790679828,
104.390356170543, 309.642115726369, 143.703581293848, 192.182626572598,
85.5383793909751, 109.768320869526, 129.407780619958, 87.8947070514884,
109.088846411816, 202.888501383226, 241.145076464161, 108.188240216837,
182.91978518217, 237.713343024316, 208.885503168406), uprHR = c(160.218633942239,
167.582026308477, 637.275268212315, 424.220465122101, 178.52382825028,
227.3335851856, 310.096846894036, 287.190918251072, 69.2323861283808,
181.9260228625, 570.338686098816, 207.118189908818, 250.494922332163,
116.346124056298, 146.154746355929, 204.991345664328, 153.312553396012,
175.614682736605, 295.188582996774, 418.892392138192, 139.797565774592,
256.852857271164, 378.025403694017, 383.153598520165)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -24L), vars = list(
season, treatment, sex), drop = TRUE, .Names = c("season",
"treatment", "sex", "time", "mean_HR", "se_HR", "lwrHR", "uprHR"
))
Could someone help me understand this error and how to fix it in my code? Many thanks!
Not entirely sure myself why/how the limitsHR <- ... statement works. I would have expected it to stop on not being able to find the lwrHR and uprHR objects in the workspace.
Anyhow, ggplot has a nice function mean_se() that will help you tremendously.
ggplot(data = lion_HR_size, mapping = aes(x = season, y = Area_HR_km,
colour=season, shape= season)) +
stat_summary(fun.data = mean_se) +
facet_grid(sex ~ treatment+time)+
theme_bw()
I have 2 data frames for 2 stacks that gives information about potential emission. One data frame gives the time frame of what hours the system turn on and off for 4 seasons. Each season start on specific date. The 2nd file give me the details of the stack.
I am trying with some sample file to test how to do this and so far I have managed to create a function following stack overflow example that allow me to create a data frame with the dates that I would like and a column with seasons for each date. I am really struggling now with the programming concept to understand how do I combine the 3 data frames to create the output template that I am trying to set up.
To show you an example my sample input are:
Stack_info File:
example seasonal Profile that shows when the system is on or off:
and the output I am after should create data frames for each year in the following format (only the black font and the red text to just explain what the values are):
What is the most difficult I am finding is that my output files for each year will have a unique first Row and the 2nd row will repeat for each pollutant. and from 3rd row the hourly data for all 8760 hours. This need to repeat for the next pollutant.
So far I have managed to create a function that helps me to assign season to each day of the year. For example:
#function to create seasons
d = function(month_day) which(lut$month_day == month_day)
lut = data.frame(all_dates = as.POSIXct("2012-1-1") + ((0:365) * 3600 * 24),
season = NA)
lut = within(lut, { month_day = strftime(all_dates, "%b-%d") })
lut[c(d("Jan-01"):d("Mar-15"), d("Nov-08"):d("Dec-31")), "season"] = "winter"
lut[c(d("Mar-16"):d("Apr-30")), "season"] = "spring"
lut[c(d("May-01"):d("Sep-27")), "season"] = "summer"
lut[c(d("Sep-28"):d("Nov-07")), "season"] = "autumn"
rownames(lut) = lut$month_day
## create date data frame and assign seasons
dates = data.frame(dates =seq(as.Date('2010-01-01'),as.Date('2012-12-31'),by = 1))
dates = within(dates, {
season = lut[strftime(dates, "%b-%d"), "season"]
})
This gives me a dates data frame and my other 2 samples data frames are (as shown in the image):
structure(list(`Source no` = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), Source = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("Stack 1", "Stack 2"), class = "factor"),
Period = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), Day = structure(c(2L,
6L, 7L, 5L, 1L, 3L, 4L, 2L, 6L, 7L, 5L, 1L, 3L, 4L, 2L, 6L,
7L, 5L, 1L, 3L, 4L), .Label = c("Fri", "Mon", "Sat", "Sun",
"Thu", "Tue", "Wed"), class = "factor"), `Spring On` = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 15L,
15L, 15L, 15L, 15L, 15L, 15L), `Spring Off` = c(23L, 23L,
23L, 23L, 23L, 23L, 23L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 18L,
18L, 18L, 18L, 18L, 18L, 18L), `Summer On` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Summer Off` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Autumn On` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Autumn Off` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Winter On` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("0", "off"), class = "factor"),
`Winter Off` = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("23",
"off"), class = "factor")), .Names = c("Source no", "Source",
"Period", "Day", "Spring On", "Spring Off", "Summer On", "Summer Off",
"Autumn On", "Autumn Off", "Winter On", "Winter Off"), class = "data.frame", row.names = c(NA,
-21L)) -> profile
structure(list(SNAME = structure(1:2, .Label = c("Stack 1", "Stack 2"
), class = "factor"), ISVARY = c(1L, 4L), VELVOL = c(1L, 4L),
TEMPDENS = c(0L, 2L), `DUM 1` = c(999L, 999L), `DUM 2` = c(999L,
999L), NPOL = c(2L, 2L), `EXIT VEL` = c(26.2, 22.4), TEMP = c(341L,
328L), `STACK DIAM` = c(1.5, 2.5), W = c(0L, 15L), Nox = c(39,
33.3), Sox = c(15.5, 17.9)), .Names = c("SNAME", "ISVARY",
"VELVOL", "TEMPDENS", "DUM 1", "DUM 2", "NPOL", "EXIT VEL", "TEMP",
"STACK DIAM", "W", "Nox", "Sox"), class = "data.frame", row.names = c(NA,
-2L)) -> stack_info
If anyone could give me any guidance of how I can proceed with the programming part would be really useful as I am just not sure how I can approach this to create separate output files as data frame for year 2010, 2011 and 2012.
The way your data is organised isn't ideal for processing. Maybe you have a look at Hadley Wickhams papar about tidy data.
According to your desired output you need a dataframe with the number of lines equal to the number of hours a specific machine (stack n) is switched on. Therefore I suggest you create a dataframe containing every hour of a given year:
d.out = data.frame(dates = seq(from=as.POSIXct("2010-01-01"), by=3600, to= as.POSIXct("2010-12-31")))
d.out$year = as.numeric(format(d.out$dates, "%Y"))
d.out$month = as.numeric(format(d.out$dates, "%m"))
d.out$day = as.numeric(format(d.out$dates, "%d"))
d.out$hour = as.numeric(format(d.out$dates, "%H"))
d.out$weekday = as.character(format(d.out$dates, "%a"))
d.out$doj = as.numeric(format(d.out$dates, "%j"))
d.out$season = "Winter"
d.out$season[d.out$doj >= 75 & d.out$doj < 121] = "Spring"
d.out$season[d.out$doj >= 121 & d.out$doj < 271] = "Summer"
d.out$season[d.out$doj >= 271 & d.out$doj < 312] = "Autumn"
The goal is to join this dataframe with your profile dataframe. Before joining, the profile-df has to be rearranged:
library(dplyr)
library(tidyr)
profile_new =
profile %>%
gather(season, hour, -c(`Source no`, Source, Period, Day)) %>%
extract(season, c("season", "status"), "(\\w+?)\\s(\\w+)") %>%
filter(hour != "off") %>%
mutate(Day = as.character(Day), hour=as.numeric(hour)) %>%
spread(status, hour)
Now it's easy to join the three dataframes to put together all the information you need to create your output:
d.out %>%
inner_join(profile_new, by=c("weekday"="Day", "season"="season")) %>%
group_by(Source, dates, year, day, weekday, season, hour) %>%
summarise(status = any(hour >= On & hour <= Off)) %>%
inner_join(stack_info, by=c("Source"="SNAME")) %>%
mutate(Nox = ifelse(status, Nox, 0),
Sox = ifelse(status, Sox, 0)) %>%
arrange(Source, year, dates, hour) %>%
select(Source, year, day, weekday, season, hour, `EXIT VEL`, TEMP, `STACK DIAM`, W, Nox, Sox)
Obviously it's not quite the format you posted. From here you could write your dataframe to a csv (stack by stack by using append = TRUE).
I have trouble figuring out how I can compute a simple mean with dplyr on Long Format data.
My data look like this :
hldid idno sex diary age
1 1294 1294_1 2 1 39
2 1294 1294_1 2 2 39
3 1294 1294_2 1 1 43
4 1294 1294_2 1 2 43
...
With 4 variables : hldid idno sex diary age
idno is the personal identifier but not the unique key.
Each individual is repeated 2 times, one for each diary filled.
What I would like is to simply compute the age mean by sex.
Could you help me out ?
I tried something like :
dta %>%
group_by(sex) %>%
mutate( ng = n_distinct(idno)) %>%
group_by(age, add=TRUE) %>%
summarise(mean = n()/ng[1] )
But it does not work.
The data :
dta = structure(list(hldid = c(1294, 1294, 1294, 1294, 1352, 1352,
1352, 1352, 3741, 3741, 3741, 3741, 3809, 3809, 3809, 3809, 4037,
4037, 4037, 4037), idno = c("1294_1", "1294_1", "1294_2", "1294_2",
"1352_1", "1352_1", "1352_2", "1352_2", "3741_1", "3741_1", "3741_2",
"3741_2", "3809_1", "3809_1", "3809_2", "3809_2", "4037_1", "4037_1",
"4037_2", "4037_2"), sex = c(2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L), diary = c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L), age = c(39L, 39L, 43L, 43L, 31L, 31L, 37L, 37L,
33L, 33L, 37L, 37L, 34L, 34L, 37L, 37L, 41L, 41L, 32L, 32L)), .Names = c("hldid",
"idno", "sex", "diary", "age"), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -20L), vars = list(hldid), drop = TRUE, indices = list(
0:3, 4:7, 8:11, 12:15, 16:19), group_sizes = c(4L, 4L, 4L,
4L, 4L), biggest_group_size = 4L, labels = structure(list(hldid = c(1294,
1352, 3741, 3809, 4037)), class = "data.frame", row.names = c(NA,
-5L), .Names = "hldid", vars = list(hldid)))
quick update
Maybe this does not apply for this example,
but this kind of issues I have in mind is the following :
Imagine we have data like this :
3 women and 2 men, and a dummy act variable.
If we do and not taking into account the Long format computing the mean, we will have problems.
aggregate(act ~ sex, FUN = mean, data = dtaTime)
What we should do is this :
aggregate(act ~ sex, FUN = sum, data = dtaTime)
6 / 2 # men
10 / 3 # women
Data
dtaTime = structure(list(id = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L),
sex = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), act = c(1L,
1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L)), .Names = c("id", "sex",
"act"), class = "data.frame", row.names = c(NA, -25L))
You are making it too complicated,
dta %>%
group_by(sex) %>%
summarise(meanage = mean(age))
should give you the mean age by sex.
A base R alternative:
aggregate(age ~ sex, dta, mean)
A data.table alternative:
library(data.table)
setDT(dta)[, .(meanage = mean(age)), by = sex]