rearranging ggplot axis groups in plot - r
I am trying to rearrange my plot such that it follows the correct confusion matrix structure from the caret package.
For example, I have the following confusion matrix table:
Prediction 0 1
0 3444 265
1 98 148
Where the 1 is the bankrupt case and the 0 is the Non-Bankrupt case in my data. I seem to have my data the opposite of what I want it to look like.
Other confusion matrix:
Reference
Prediction 0 1
0 3317 313
1 87 164
Reference
Prediction 0 1
0 3079 308
1 78 182
Reference
Prediction 0 1
0 2980 335
1 106 144
I want to rearrange the plots such that (for the first confusion matrix) the 3444 number and points are in the upper left quadrant with the text "Non-Bankrupt" next to it (which is the 0 in the confusion matrix). The 148 should be in the bottom right quadrant. The 265 should be in the upper right quadrant and the 98 should be in the bottom left quadrant. (Just as displayed in the first confusion matrix)
Code:
# First attempt: predicted class on the x axis, actual class on the y axis,
# one panel per model. Points are jittered and coloured by `correct`, and the
# cell count `n` is printed over them in black serif text.
ggplot(
  data = d,
  mapping = aes(x = pred_status, y = status, colour = correct)
) +
  geom_jitter() +
  geom_text(
    mapping = aes(label = n),
    colour = "black",
    family = "serif",
    size = 15
  ) +
  facet_wrap(vars(Model_Name)) +
  # Labels are positional: first label -> first level drawn on each axis.
  scale_x_discrete(labels = c("Non-Bankrupt", "Bankrupt")) +
  scale_y_discrete(labels = c("Non-Bankrupt", "Bankrupt"))
Sample of the data:
d <- structure(list(pred_status = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), status = structure(c(1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
correct = structure(c(1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Correct",
"Incorrect"), class = "factor"), n = c(2980L, 2980L, 313L,
3444L, 2980L, 3444L, 3444L, 2980L, 3317L, 2980L, 3317L, 3317L,
182L, 3317L, 3079L, 2980L, 3444L, 3444L, 3317L, 3079L, 3079L,
144L, 3444L, 3444L, 3079L, 2980L, 2980L, 3444L, 3444L, 3317L,
3444L, 3444L, 2980L, 3444L, 3317L, 3079L, 3079L, 3079L, 3444L,
2980L, 3317L, 3444L, 3444L, 3317L, 144L, 3079L, 2980L, 2980L,
3317L, 3079L, 182L, 335L, 3444L, 3317L, 3444L, 3444L, 3444L,
106L, 3079L, 3317L, 3079L, 3079L, 3444L, 3444L, 3079L, 164L,
3444L, 2980L, 3079L, 2980L, 3079L, 164L, 3317L, 3317L, 2980L,
148L, 313L, 3444L, 3079L, 3317L, 3444L, 3079L, 3444L, 3317L,
3444L, 3444L, 3079L, 3444L, 3317L, 3079L, 144L, 3079L, 3317L,
3317L, 3079L, 3317L, 3444L, 2980L, 3317L, 2980L, 3444L, 2980L,
3079L, 3079L, 3444L, 3444L, 2980L, 3317L, 3317L, 2980L, 3079L,
98L, 3079L, 3444L, 3444L, 2980L, 3317L, 3079L, 3444L, 2980L,
3079L, 3317L, 144L, 182L, 3317L, 3079L, 2980L, 3079L, 3444L,
313L, 2980L, 3317L, 3444L, 3317L, 3317L, 3079L, 2980L, 3444L,
182L, 3079L, 3317L, 3444L, 265L, 164L, 335L, 3079L, 2980L,
2980L, 3444L, 3079L, 3444L, 2980L, 3317L, 148L, 3444L, 2980L,
3079L, 3444L, 3079L, 2980L, 3317L, 3444L, 2980L, 98L, 3317L,
78L, 3317L, 2980L, 3079L, 106L, 3079L, 3079L, 3079L, 2980L,
2980L, 3317L, 2980L, 3444L, 3444L, 3079L, 2980L, 3444L, 3079L,
2980L, 3317L, 3317L, 164L, 308L, 3317L, 3444L, 313L, 3444L,
3317L, 3444L, 2980L, 3317L, 3317L, 3444L, 2980L, 3444L),
Model_Name = c("4 Year", "4 Year", "2 Year", "1 Year", "4 Year",
"1 Year", "1 Year", "4 Year", "2 Year", "4 Year", "2 Year",
"2 Year", "3 Year", "2 Year", "3 Year", "4 Year", "1 Year",
"1 Year", "2 Year", "3 Year", "3 Year", "4 Year", "1 Year",
"1 Year", "3 Year", "4 Year", "4 Year", "1 Year", "1 Year",
"2 Year", "1 Year", "1 Year", "4 Year", "1 Year", "2 Year",
"3 Year", "3 Year", "3 Year", "1 Year", "4 Year", "2 Year",
"1 Year", "1 Year", "2 Year", "4 Year", "3 Year", "4 Year",
"4 Year", "2 Year", "3 Year", "3 Year", "4 Year", "1 Year",
"2 Year", "1 Year", "1 Year", "1 Year", "4 Year", "3 Year",
"2 Year", "3 Year", "3 Year", "1 Year", "1 Year", "3 Year",
"2 Year", "1 Year", "4 Year", "3 Year", "4 Year", "3 Year",
"2 Year", "2 Year", "2 Year", "4 Year", "1 Year", "2 Year",
"1 Year", "3 Year", "2 Year", "1 Year", "3 Year", "1 Year",
"2 Year", "1 Year", "1 Year", "3 Year", "1 Year", "2 Year",
"3 Year", "4 Year", "3 Year", "2 Year", "2 Year", "3 Year",
"2 Year", "1 Year", "4 Year", "2 Year", "4 Year", "1 Year",
"4 Year", "3 Year", "3 Year", "1 Year", "1 Year", "4 Year",
"2 Year", "2 Year", "4 Year", "3 Year", "1 Year", "3 Year",
"1 Year", "1 Year", "4 Year", "2 Year", "3 Year", "1 Year",
"4 Year", "3 Year", "2 Year", "4 Year", "3 Year", "2 Year",
"3 Year", "4 Year", "3 Year", "1 Year", "2 Year", "4 Year",
"2 Year", "1 Year", "2 Year", "2 Year", "3 Year", "4 Year",
"1 Year", "3 Year", "3 Year", "2 Year", "1 Year", "1 Year",
"2 Year", "4 Year", "3 Year", "4 Year", "4 Year", "1 Year",
"3 Year", "1 Year", "4 Year", "2 Year", "1 Year", "1 Year",
"4 Year", "3 Year", "1 Year", "3 Year", "4 Year", "2 Year",
"1 Year", "4 Year", "1 Year", "2 Year", "3 Year", "2 Year",
"4 Year", "3 Year", "4 Year", "3 Year", "3 Year", "3 Year",
"4 Year", "4 Year", "2 Year", "4 Year", "1 Year", "1 Year",
"3 Year", "4 Year", "1 Year", "3 Year", "4 Year", "2 Year",
"2 Year", "2 Year", "3 Year", "2 Year", "1 Year", "2 Year",
"1 Year", "2 Year", "1 Year", "4 Year", "2 Year", "2 Year",
"1 Year", "4 Year", "1 Year")), row.names = c(NA, -200L), class = c("tbl_df",
"tbl", "data.frame"))
I tried and came up with these minor changes:
library(tidyverse)

# Draw each model's confusion matrix in caret's layout: Reference (`status`)
# on the x axis and Prediction (`pred_status`) on the y axis.
# fct_rev() reverses the y levels so that "0" is drawn in the top row.
# Labels are named vectors keyed by factor level ("0" / "1"), so each label
# stays attached to its class no matter in which order the levels are drawn
# (positional labels would silently swap if the level order changed).
d %>%
  ggplot(aes(x = status, y = fct_rev(pred_status), color = correct)) +
  geom_jitter() +
  geom_text(aes(label = n), size = 15, color = "black", family = "serif") +
  facet_wrap(~Model_Name) +
  scale_x_discrete(labels = c("0" = "Non-Bankrupt", "1" = "Bankrupt")) +
  scale_y_discrete(labels = c("0" = "Non-Bankrupt", "1" = "Bankrupt")) +
  # Explicit titles; otherwise the y axis is titled "fct_rev(pred_status)".
  labs(x = "Reference", y = "Prediction")
Which gives me this figure:
Not 100% sure if the labels are correct now?
Related
xAxis order of R highcharter column plot
With the following data frame: dta <- structure(list(sociodemographic_var = structure(c(3L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 26L, 18L, 20L, 21L, 26L, 13L, 16L, 21L, 22L, 26L, 26L, 9L, 13L, 17L, 18L, 20L, 21L, 23L, 26L, 20L, 26L), levels = c("1st grade", "2nd grade", "3rd grade", "4th grade", "5th grade", "6th grade", "7th grade", "8th grade", "9th grade", "10th grade", "11th grade", "12th grade, no diploma", "High school graduate", "GED or equivalent", "Some college, no degree", "Less than 1 year of college credit/post-secondary education (or less than 10 classes)", "One year or more of college credit, no degree", "Associate degree: Occupational, Technical, or Vocational", "Associate degree: Academic Program", "Bachelor's degree (ex. BA, AB, BS, BBS)", "Master's degree (ex. MA, MS, MEng, MEd, MBA)", "Professional School degree (ex. MD, DDS, DVN, JD)", "Doctoral degree (ex. PhD, EdD)", "Refused to answer", "Don't Know", "unknown"), class = "factor"), event = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 5L, 7L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 11L, 11L), levels = c("Baseline", "0.5 Year", "1 Year", "1.5 Year", "2 Year", "2.5 Year", "3 Year", "3.5 Year", "4 Year", "4.5 Year", "5 Year", "5.5 Year", "6 Year", "Screener"), class = "factor"), visit_type = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("on-site", "hybrid", "remote", "unknown"), class = "factor"), n = c(2L, 13L, 5L, 9L, 15L, 18L, 26L, 25L, 192L, 27L, 485L, 224L, 183L, 1011L, 666L, 55L, 78L, 3L, 9L, 1L, 1L, 2L, 208L, 1L, 1L, 1L, 1L, 126L, 28L, 1L, 1L, 2L, 2L, 3L, 4L, 1L, 543L, 1L, 300L)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -39L)) I would assume that, generating a highcharter bar plot with: library(highcharter) # v0.9.4 dta |> 
hchart(type = "column", hcaes(x = "event", y = "n", group = "sociodemographic_var")) |> hc_yAxis(title = list(text = "%"), max = 115, endOnTick = FALSE, stackLabels = list(enabled = TRUE)) |> hc_xAxis(title = "") |> hc_plotOptions(series = list(stacking = "percent")) the xAxis would be ordered by levels(dta$event): levels(dta$event) [1] "Baseline" "0.5 Year" "1 Year" "1.5 Year" "2 Year" "2.5 Year" "3 Year" "3.5 Year" "4 Year" "4.5 Year" "5 Year" "5.5 Year" [13] "6 Year" "Screener" But the ordering is different and neither alphabetical nor based on the total number of values: I am interested to understand why it's the case and how to set the order right.
You can add categories to your hc_xAxis to make an order like this: library(highcharter) dta |> hchart(type = "column", hcaes(x = "event", y = "n", group = "sociodemographic_var")) |> hc_yAxis(title = list(text = "%"), max = 115, endOnTick = FALSE, stackLabels = list(enabled = TRUE)) |> hc_xAxis(title = "", categories = levels(dta$event)) |> hc_plotOptions(series = list(stacking = "percent")) Output:
Regex for 11, 12 etc but not 1
I'm trying to filter a data set, and only keep Scenarios 11, 12, 13, 14 etc (but not Scenario 1). My input looks like this: structure(list(Title.1 = structure(c(1L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 2L, 3L, 4L, 5L, 6L, 1L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 2L, 3L, 4L, 5L, 6L), .Label = c("Scenario 1", "Scenario 10", "Scenario 11", "Scenario 12", "Scenario 13", "Scenario 14", "Scenario 2", "Scenario 3", "Scenario 4", "Scenario 5", "Scenario 6", "Scenario 7", "Scenario 8", "Scenario 9"), class = "factor"), Color = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Blue", "Red"), class = "factor")), class = "data.frame", row.names = c(NA, -28L)) and my output would ideally look like this: structure(list(Title.1 = structure(c(2L, 3L, 4L, 5L, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "Scenario 10", "Scenario 11", "Scenario 12", "Scenario 13", "Scenario 14"), class = "factor"), Color = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "Blue"), class = "factor")), class = "data.frame", row.names = c(NA, -28L)) I could define what I'm keeping (ie. keep = c('Scenario 11', 'Scenario 12' etc) and then filter it by "Title" and then filter it again by 'Color', but I'm sure I could do it in one line with a Regex. My issue is that I can't seem to tell it to ignore Scenario 1. Could someone please point me in the right direction?
If you want to do this with regex we can use grepl to select rows which have "1" followed by another digit along with Color = 'Blue'. subset(df, grepl('\\b1\\d\\b', Title.1) & Color == 'Blue') # Title.1 Color #24 Scenario 10 Blue #25 Scenario 11 Blue #26 Scenario 12 Blue #27 Scenario 13 Blue #28 Scenario 14 Blue
We can use filter with str_detect from tidyverse library(dplyr) library(stringr) df %>% filter(str_detect(Title.1, '\\b1\\d\\b') & Color == 'Blue')
Bar chart showing NA bar when there are no NA values
My visualisation is showing an NA bar chart despite the fact that I have imputed all NA values in my incomeLev column and explicitly removed all NA values from the mental health (which is in my stacked bar visualisation) brfss2013$mentalHealth <- forcats::fct_explicit_na(brfss2013$mentalHealth, na_level = "Missing") brfss2013$incomeLev <- as.factor(brfss2013$incomeLev) brfss2013 <- subset(brfss2013, !is.na(incomeLev)) brfss2013 %>% add_count(incomeLev) %>% rename(count_inc = n) %>% count(incomeLev, mentalHealth, count_inc) %>% rename(count_mentalHealth = n) %>% mutate(percent= count_mentalHealth / count_inc) %>% mutate(incomeLev = factor(incomeLev, levels=c('0-$20k','25-$35k','35-$50k','50-$75k','>$75k')))%>% ggplot(aes(x= incomeLev, y= count_mentalHealth, group= mentalHealth)) + xlab('Annual Income')+ylab('Number of People')+ geom_bar(aes(fill=mentalHealth), stat="identity",na.rm=TRUE)+ # Using the scales package does the percent formatting for you geom_text(aes(label = scales::percent(percent)),position = position_stack(vjust = 0.5))+ theme_minimal() Here is a sample of my data: brfss2013<-structure(list(incomeLev = structure(c(5L, 1L, 1L, 5L, 4L, 1L, 1L, 4L, 1L, 3L), .Label = c(">$75k", "0-$20k", "25-$35k", "35-$50k", "50-$75"), class = "factor"), healtheat = c(4.66, 1.68, 2.37, 1.85, 2.5, 3, 3.66, 4.27, 2.72, 1.72), X_age_g = structure(c(5L, 4L, 5L, 5L, 6L, 4L, 3L, 5L, 4L, 6L), .Label = c("Age 18 to 24", "Age 25 to 34", "Age 35 to 44", "Age 45 to 54", "Age 55 to 64", "Age 65 or older"), class = "factor"), employ1 = structure(c(7L, 1L, 1L, 7L, 7L, 1L, 1L, 7L, 7L, 5L), .Label = c("Employed for wages", "Self-employed", "Out of work for 1 year or more", "Out of work for less than 1 year", "A homemaker", "A student", "Retired", "Unable to work"), class = "factor"), renthom1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L), .Label = c("Own", "Rent", "Other arrangement"), class = "factor"), sex = structure(c(2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L), .Label = 
c("Male", "Female"), class = "factor"), physLev = structure(c(3L, 1L, 3L, 1L, 2L, 1L, 2L, 1L, 2L, 2L), .Label = c("0-200", "200-500", "500-1000", "1000-2000", "2000-4000", "4000-10000", ">10000" ), class = "factor"), mentalHealth = structure(c(5L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L), .Label = c("Excellent", "Good", "Ok", "Bad", "Very Bad", "Missing"), class = "factor")), row.names = c(NA, 10L), class = "data.frame")
Creating output based on multiple criteria
I have 2 data frames for 2 stacks that give information about potential emissions. One data frame gives the time frame of the hours at which the system turns on and off in each of the 4 seasons. Each season starts on a specific date. The 2nd file gives me the details of the stack. I am testing how to do this with some sample files, and so far I have managed to create a function, following a Stack Overflow example, that allows me to create a data frame with the dates that I would like and a column with the season for each date. I am really struggling now with the programming concept of how to combine the 3 data frames to create the output template that I am trying to set up. To show you an example, my sample inputs are: Stack_info file: Example seasonal profile that shows when the system is on or off: The output I am after should create data frames for each year in the following format (only the black font is part of the output; the red text just explains what the values are): What I am finding most difficult is that my output file for each year will have a unique first row, the 2nd row will repeat for each pollutant, and from the 3rd row on comes the hourly data for all 8760 hours. This needs to repeat for the next pollutant. So far I have managed to create a function that helps me assign a season to each day of the year. 
For example: #function to create seasons d = function(month_day) which(lut$month_day == month_day) lut = data.frame(all_dates = as.POSIXct("2012-1-1") + ((0:365) * 3600 * 24), season = NA) lut = within(lut, { month_day = strftime(all_dates, "%b-%d") }) lut[c(d("Jan-01"):d("Mar-15"), d("Nov-08"):d("Dec-31")), "season"] = "winter" lut[c(d("Mar-16"):d("Apr-30")), "season"] = "spring" lut[c(d("May-01"):d("Sep-27")), "season"] = "summer" lut[c(d("Sep-28"):d("Nov-07")), "season"] = "autumn" rownames(lut) = lut$month_day ## create date data frame and assign seasons dates = data.frame(dates =seq(as.Date('2010-01-01'),as.Date('2012-12-31'),by = 1)) dates = within(dates, { season = lut[strftime(dates, "%b-%d"), "season"] }) This gives me a dates data frame and my other 2 samples data frames are (as shown in the image): structure(list(`Source no` = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), Source = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Stack 1", "Stack 2"), class = "factor"), Period = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), Day = structure(c(2L, 6L, 7L, 5L, 1L, 3L, 4L, 2L, 6L, 7L, 5L, 1L, 3L, 4L, 2L, 6L, 7L, 5L, 1L, 3L, 4L), .Label = c("Fri", "Mon", "Sat", "Sun", "Thu", "Tue", "Wed"), class = "factor"), `Spring On` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 15L, 15L, 15L, 15L, 15L, 15L, 15L), `Spring Off` = c(23L, 23L, 23L, 23L, 23L, 23L, 23L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 18L, 18L, 18L, 18L, 18L, 18L, 18L), `Summer On` = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Summer Off` = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Autumn On` = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L), .Label = "off", class = "factor"), `Autumn Off` = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Winter On` = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("0", "off"), class = "factor"), `Winter Off` = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("23", "off"), class = "factor")), .Names = c("Source no", "Source", "Period", "Day", "Spring On", "Spring Off", "Summer On", "Summer Off", "Autumn On", "Autumn Off", "Winter On", "Winter Off"), class = "data.frame", row.names = c(NA, -21L)) -> profile structure(list(SNAME = structure(1:2, .Label = c("Stack 1", "Stack 2" ), class = "factor"), ISVARY = c(1L, 4L), VELVOL = c(1L, 4L), TEMPDENS = c(0L, 2L), `DUM 1` = c(999L, 999L), `DUM 2` = c(999L, 999L), NPOL = c(2L, 2L), `EXIT VEL` = c(26.2, 22.4), TEMP = c(341L, 328L), `STACK DIAM` = c(1.5, 2.5), W = c(0L, 15L), Nox = c(39, 33.3), Sox = c(15.5, 17.9)), .Names = c("SNAME", "ISVARY", "VELVOL", "TEMPDENS", "DUM 1", "DUM 2", "NPOL", "EXIT VEL", "TEMP", "STACK DIAM", "W", "Nox", "Sox"), class = "data.frame", row.names = c(NA, -2L)) -> stack_info If anyone could give me any guidance of how I can proceed with the programming part would be really useful as I am just not sure how I can approach this to create separate output files as data frame for year 2010, 2011 and 2012.
The way your data is organised isn't ideal for processing. Maybe you have a look at Hadley Wickhams papar about tidy data. According to your desired output you need a dataframe with the number of lines equal to the number of hours a specific machine (stack n) is switched on. Therefore I suggest you create a dataframe containing every hour of a given year: d.out = data.frame(dates = seq(from=as.POSIXct("2010-01-01"), by=3600, to= as.POSIXct("2010-12-31"))) d.out$year = as.numeric(format(d.out$dates, "%Y")) d.out$month = as.numeric(format(d.out$dates, "%m")) d.out$day = as.numeric(format(d.out$dates, "%d")) d.out$hour = as.numeric(format(d.out$dates, "%H")) d.out$weekday = as.character(format(d.out$dates, "%a")) d.out$doj = as.numeric(format(d.out$dates, "%j")) d.out$season = "Winter" d.out$season[d.out$doj >= 75 & d.out$doj < 121] = "Spring" d.out$season[d.out$doj >= 121 & d.out$doj < 271] = "Summer" d.out$season[d.out$doj >= 271 & d.out$doj < 312] = "Autumn" The goal is to join this dataframe with your profile dataframe. Before joining, the profile-df has to be rearranged: library(dplyr) library(tidyr) profile_new = profile %>% gather(season, hour, -c(`Source no`, Source, Period, Day)) %>% extract(season, c("season", "status"), "(\\w+?)\\s(\\w+)") %>% filter(hour != "off") %>% mutate(Day = as.character(Day), hour=as.numeric(hour)) %>% spread(status, hour) Now it's easy to join the three dataframes to put together all the information you need to create your output: d.out %>% inner_join(profile_new, by=c("weekday"="Day", "season"="season")) %>% group_by(Source, dates, year, day, weekday, season, hour) %>% summarise(status = any(hour >= On & hour <= Off)) %>% inner_join(stack_info, by=c("Source"="SNAME")) %>% mutate(Nox = ifelse(status, Nox, 0), Sox = ifelse(status, Sox, 0)) %>% arrange(Source, year, dates, hour) %>% select(Source, year, day, weekday, season, hour, `EXIT VEL`, TEMP, `STACK DIAM`, W, Nox, Sox) Obviously it's not quite the format you posted. 
From here you could write your dataframe to a csv (stack by stack by using append = TRUE).
some data are not showing up on the ggplot2 window
head(x) Region Type Date count 1 Americas Point 2011-10-26 1 2 Americas Point 2011-10-27 2 3 Americas Point 2011-10-31 1 4 Americas Point 2011-11-01 1 5 Americas Point 2011-12-05 1 6 Americas Point 2011-12-07 1 dput(x) structure(list(Region = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Americas", class = "factor"), Type = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Point", class = "factor"), Date = structure(c(15273, 15274, 15278, 15279, 15313, 15315, 15316, 15320, 15341, 15342, 15351, 15358, 15370, 15390, 15392, 15405, 15407, 15411, 15418, 15421, 15433, 15467, 15470, 15482, 15495, 15497, 15503, 15517, 15530, 15551, 15554, 15582, 15586, 15589, 15593, 15601, 15602, 15610, 15615, 15616, 15624, 15643, 15645, 15656, 15663, 15664, 15665, 15672, 15673, 15677, 15678, 15679, 15680, 15684, 15686, 15693, 15694, 15698, 15699, 15705, 15706, 15707, 15712, 15713, 15714, 15719, 15720, 15721, 15727, 15736, 15740, 15741, 15742, 15743), class = "Date"), count = c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("Region", "Type", "Date", "count"), row.names = c(NA, -74L), class = "data.frame") I am trying to build a stack bar graph as follows: ggplot(x, aes(Date, count, group=Region)) + 
geom_bar(aes(fill=Type, width=0.3),stat="identity", position="stack") + scale_x_date(breaks = "1 month", minor_breaks = "2 weeks", labels=date_format("%b-%y")) + geom_smooth(method="lm", se=T, size=0.5, colour="yellow") + facet_wrap(~Region) By default, some points are missing, but when I stretch the plot window, they appear. I really need all the points in the chart; otherwise it looks like I am misreporting the data. Any suggestions on how I can address this so that I see all the data points on the chart? My window size is 500 by 500.
Indeed, by increasing the screen size more bars appear. You can't see them in the small window of the R console because the width of the bars is too small. But when you save it, the bars can be seen in the output: plot <- ggplot(x, aes(Date, count, group=Region)) + geom_bar(aes(fill=Type, width=0.3),stat="identity", position="stack") + scale_x_date(breaks = "1 month", minor_breaks = "2 weeks") + geom_smooth(method="lm", se=T, size=0.5, colour="yellow") + facet_wrap(~Region) ggsave("test.pdf",plot ) To see all the points in the R console increase the width, for instance: (plot <- ggplot(x, aes(Date, count, group=Region)) + geom_bar(aes(fill=Type, width=1),stat="identity", position="stack") + scale_x_date(breaks = "1 month", minor_breaks = "2 weeks") + geom_smooth(method="lm", se=T, size=0.5, colour="yellow") + facet_wrap(~Region))