I'm trying to display the flag of each country, but most of the flags my code produces are incorrect. I want each flag to be displayed only at the end of its line (geom_line), I want the flags added to the legend, and I want them to be more visible than they are now.
# Line chart of energy production per country and year, faceted by energy
# type, with each country's flag drawn once at the end of its line.
categ_top10EnergyModf %>%
  # `country` is a grouping column of the input; drop the grouping so that
  # mutate() is allowed to rewrite it.
  ungroup() %>%
  # geom_flag() expects lower-case two-letter codes.
  # NOTE(review): ggflags appears to key the United Kingdom as "gb", not
  # "uk" -- confirm against the installed flag list.
  mutate(country = sub("^uk$", "gb", tolower(country))) %>%
  ggplot(aes(x = year, y = ggwt_hours, color = country, group = country)) +
  geom_line(size = 1.5) +
  geom_point(size = 3) +
  geom_flag(
    # Pass the plain character code: wrapping it in factor() presumably makes
    # the flag lookup use the factor's integer codes, which would explain the
    # wrong flags -- TODO confirm.
    aes(country = country),
    # Keep only the last year so every line gets a single flag at its end.
    data = function(d) d[as.character(d$year) == max(as.character(d$year)), ],
    # A fixed size belongs outside aes(); inside aes() it is treated as a
    # mapped variable. Larger than the points so the flag stays visible.
    size = 8
  ) +
  scale_y_continuous(labels = scales::comma) +
  facet_wrap(~type2, scales = "free") +
  labs(
    x = "Year",
    y = "Energy Production (GWh)",
    title = "Analysis of the Growth of Renewable/Non-Renewable Energy Production"
  ) +
  scale_color_discrete(
    name = "Country",
    # Named labels are matched to the codes by name, so the legend stays
    # correct even if the set or order of countries changes.
    labels = c(
      de = "Germany",
      es = "Spain",
      fr = "France",
      it = "Italy",
      no = "Norway",
      pl = "Poland",
      se = "Sweden",
      tr = "Turkey",
      ua = "Ukraine",
      gb = "United Kingdom"
    )
  ) +
  theme_grey() +
  theme(plot.title = element_text(hjust = 0.5))
> dput(head(categ_top10EnergyModf))
structure(list(country = c("de", "de", "fr", "fr", "fr", "de"
), country_name = c("Germany", "Germany", "France", "France",
"France", "Germany"), type2 = c("Non-Renewable", "Non-Renewable",
"Non-Renewable", "Non-Renewable", "Non-Renewable", "Non-Renewable"
), year = structure(c(1L, 2L, 2L, 3L, 1L, 3L), .Label = c("2016",
"2017", "2018"), class = "factor"), ggwt_hours = c(471984, 449906,
448690.614, 447109.694, 445175.494, 393234.585)), row.names = c(NA,
-6L), groups = structure(list(country = c("de", "de", "de", "fr",
"fr", "fr"), country_name = c("Germany", "Germany", "Germany",
"France", "France", "France"), year = structure(c(1L, 2L, 3L,
1L, 2L, 3L), .Label = c("2016", "2017", "2018"), class = "factor"),
.rows = structure(list(1L, 2L, 6L, 5L, 3L, 4L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 6L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
You could display a flag as the last point:
# Restrict this layer to the 2018 rows so each line gets one flag at its end.
# Subsetting the layer data avoids ifelse(), which strips the factor levels of
# `year` and leaves NA rows that ggplot2 then drops with a warning.
geom_flag(data = function(d) d[d$year == "2018", ], size = 4)
I also set size outside aes(), as a fixed value rather than as an aesthetic mapping.
You don't have to use factor(country): you already defined the country aesthetic in ggplot().
I am writing a model for my dataset. Once bake() is executed, the result is missing one predictor and the outcome.
This happens after I add the recipe steps. Is there any way to resolve this issue?
# Split the data into training and test sets.
top10_renewableEnergyProd_split <- initial_split(top10_renewableEnergyProd)
top10_renewableEnergyProd_train <- training(top10_renewableEnergyProd_split)
top10_renewableEnergyProd_test <- testing(top10_renewableEnergyProd_split)

# Model the 2018 production from the country name and the two earlier years.
top10_renewableEnergyProd_recipe <- recipe(
  energyProd_2018 ~ country_name + energyProd_2016 + energyProd_2017,
  data = top10_renewableEnergyProd_train
)

# Recipe steps: centre and scale the numeric predictors, then drop highly
# correlated ones. The outcome is excluded from every step; without
# -all_outcomes() in step_corr() the outcome itself is a candidate for removal
# by the correlation filter, which is why it vanished from the baked data.
# NOTE(review): a predictor can still be removed when it is strongly
# correlated with another predictor; raise step_corr()'s `threshold` if both
# must be kept.
top10_renewableEnergyProd_recipe <- top10_renewableEnergyProd_recipe %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes()) %>%
  step_corr(all_numeric(), -all_outcomes())

# Estimate the step parameters on the training set, then apply them to it.
top10_renewableEnergyProd_prep <- prep(
  top10_renewableEnergyProd_recipe,
  training = top10_renewableEnergyProd_train
)
top10_renewableEnergyProd_bake <- bake(
  top10_renewableEnergyProd_prep,
  top10_renewableEnergyProd_train
)
top10_renewableEnergyProd_bake
> dput(top10_renewableEnergyProd)
structure(list(type2 = c("Renewable", "Renewable", "Renewable",
"Renewable", "Renewable", "Renewable", "Renewable", "Renewable",
"Renewable", "Renewable"), country = c("DE", "ES", "FR", "IT",
"NO", "PL", "SE", "TR", "UA", "UK"), country_name = c("Germany",
"Spain", "France", "Italy", "Norway", "Poland", "Sweden", "Turkey",
"Ukraine", "United Kingdom"), energyProd_2016 = c(147622, 103353,
99885.054, 90756.826, 146557, 15468, 77505, 87090, 12097, 58909.047
), energyProd_2017 = c(175063, 84664, 93907.184, 86786.294, 146285,
18187.708, 82540, 83536.342, 12082.6, 73113.964), energyProd_2018 = c(185226.211,
99725.566, 113658.177, 96820, 146878.825, 15541.473, 77615.947,
93425.906, 13843.9, 79955.967)), row.names = c(NA, -10L), groups = structure(list(
country = c("DE", "ES", "FR", "IT", "NO", "PL", "SE", "TR",
"UA", "UK"), country_name = c("Germany", "Spain", "France",
"Italy", "Norway", "Poland", "Sweden", "Turkey", "Ukraine",
"United Kingdom"), type2 = c("Renewable", "Renewable", "Renewable",
"Renewable", "Renewable", "Renewable", "Renewable", "Renewable",
"Renewable", "Renewable"), .rows = structure(list(1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
> dput(top10_renewableEnergyProd_bake)
structure(list(country_name = structure(c(2L, 5L, 1L, 3L, 4L,
6L, 7L, 8L), .Label = c("France", "Germany", "Italy", "Norway",
"Spain", "Sweden", "Turkey", "United Kingdom"), class = "factor"),
energyProd_2016 = c(1.47285970883518, 0.0604065991667829,
-0.0502421869332435, -0.341488748282317, 1.4388796649716,
-0.764303423167326, -0.458483028395103, -1.35762858619557
)), row.names = c(NA, -8L), class = c("tbl_df", "tbl", "data.frame"
))
Replicating a visualization I saw in print media using ggplot2
Context:
I am always looking to make data visualizations more appealing/aesthetic specifically for non-data people, who are the majority of people I work with (stakeholders like marketers, management, etc) -- I've noted that when visualizations look like academic-publication-quality (standard ggplot2 aesthetics) they tend to assume they can't understand it and don't bother trying, defeating the whole purpose of visualizations in the first place. However, when it looks more graphic'y (like something you may see on websites or marketing material) they focus and try to understand the visualization, usually successfully. Often we'll end up in the most interesting discussions from these types of visualizations, so that is my ultimate goal.
The Visualization:
Here is something I saw in a marketing brochure on the device share of web traffic by geo. Though it is actually a bit busy and unclear, it resonated better than a similar stacked bar chart I created with standard ggplot2 aesthetics. I have not the slightest idea how I might replicate something like this within ggplot2, so any attempts would be much appreciated! Here is some sample tidy data to use in a data.table:
structure(list(country = c("Argentina", "Argentina", "Argentina",
"Brazil", "Brazil", "Brazil", "Canada",
"Canada", "Canada", "China", "China",
"China", "Japan", "Japan", "Japan", "Spain",
"Spain", "Spain", "UK", "UK", "UK", "USA",
"USA", "USA"),
device_type = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L),
class = "factor",
.Label = c("desktop",
"mobile",
"multi")),
proportion = c(0.37, 0.22, 0.41, 0.3, 0.31, 0.39,
0.35, 0.06, 0.59, 0.19, 0.2, 0.61,
0.4, 0.18, 0.42, 0.16, 0.28, 0.56,
0.27, 0.06, 0.67, 0.37, 0.08, 0.55)),
.Names = c("country", "device_type", "proportion"),
row.names = c(NA, -24L),
class = c("data.table", "data.frame"))
You could also consider googleVis
library(googleVis)
# Sample data in long format: 24 rows, one per country and device type
# (desktop / mobile / multi), with that combination's `proportion`.
# The object carries the classes "data.table" and "data.frame".
dat <- structure(list(country = c("Argentina", "Argentina", "Argentina",
"Brazil", "Brazil", "Brazil", "Canada",
"Canada", "Canada", "China", "China",
"China", "Japan", "Japan", "Japan", "Spain",
"Spain", "Spain", "UK", "UK", "UK", "USA",
"USA", "USA"),
device_type = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L),
class = "factor",
.Label = c("desktop",
"mobile",
"multi")),
proportion = c(0.37, 0.22, 0.41, 0.3, 0.31, 0.39,
0.35, 0.06, 0.59, 0.19, 0.2, 0.61,
0.4, 0.18, 0.42, 0.16, 0.28, 0.56,
0.27, 0.06, 0.67, 0.37, 0.08, 0.55)),
.Names = c("country", "device_type", "proportion"),
row.names = c(NA, -24L),
class = c("data.table", "data.frame"))
# Order in which the links (countries) and the nodes (countries interleaved
# with their device types) first appear in the data.
link_order <- unique(dat$country)
node_order <- unique(as.vector(rbind(dat$country, as.character(dat$device_type))))

# Colour lookup tables: one colour per country for the links, and one colour
# per country or device type for the nodes. Both tables keep their columns as
# character so the lookups below return plain strings.
link_cols <- data.frame(
  color = c("#ffd1ab", "#ff8d14", "#ff717e", "#dd2c40", "#d6b0ea",
            "#8c4fab", "#00addb", "#297cbe"),
  country = c("UK", "Canada", "USA", "China", "Spain", "Japan",
              "Argentina", "Brazil"),
  stringsAsFactors = FALSE
)
node_cols <- data.frame(
  color = c("#ffc796", "#ff7100", "#ff485b", "#d20000",
            "#cc98e6", "#6f2296", "#009bd2", "#005daf",
            "grey", "grey", "grey"),
  type = c("UK", "Canada", "USA", "China", "Spain", "Japan",
           "Argentina", "Brazil", "multi", "desktop", "mobile"),
  stringsAsFactors = FALSE
)

# Line the colours up with link_order / node_order. match() is vectorised and
# always yields exactly one value per key, unlike sapply(), whose return type
# changes when a key is missing from the table.
link_cols2 <- link_cols$color[match(link_order, link_cols$country)]
node_cols2 <- node_cols$color[match(node_order, node_cols$type)]
# Fail early if the data contains a country or device type with no colour.
stopifnot(!anyNA(link_cols2), !anyNA(node_cols2))

# Build the JavaScript array literals and the options string passed as the
# `sankey` option.
actual_link_cols <- paste0("[", paste0("'", link_cols2, "'", collapse = ","), "]")
actual_node_cols <- paste0("[", paste0("'", node_cols2, "'", collapse = ","), "]")
opts <- paste0("{
link: { colorMode: 'source',
colors: ", actual_link_cols ," },
node: {colors: ", actual_node_cols ,"}}")

Sankey <- gvisSankey(
  dat,
  from = "country",
  to = "device_type",
  weight = "proportion",
  options = list(height = 500, width = 1000, sankey = opts)
)
plot(Sankey)
You can also try the ggalluvial package and its geoms (for example geom_alluvium() and geom_stratum()).
Check this out
I want to replace the , with a . as the decimal point in a dataframe. I can do this with df$X2005 <- as.numeric(gsub(',', '.', df$X2005)) for each variable. Is there a more efficient way to do this for the whole dataframe at once?
Some example data:
df <- structure(list(country = structure(1:6, .Label = c("Australia", "Austria", "Belgium", "Canada", "Chile", "Czech Republic", "Denmark", "Estonia", "Finland", "France", "Germany", "Greece", "Hungary", "Iceland", "Ireland", "Israel", "Italy", "Japan", "Korea", "Luxembourg", "Mexico", "Netherlands", "New Zealand", "Norway", "Poland", "Portugal", "Slovak Republic", "Slovenia", "Spain", "Sweden", "Switzerland", "Turkey", "United Kingdom", "United States"), class = "factor"), X2005 = structure(c(26L, 2L, 34L, 33L, 13L, 14L), .Label = c("10,3533", "10,4187", "10,8089", "10,8629", "10,882", "11,0173", "15,8399", "5,0226", "5,4488", "5,6273", "5,8713", "6,2137", "6,6397", "6,9339", "7,0448", "7,5719", "7,8534", "7,9457", "8,1819", "8,2668", "8,2883", "8,3556", "8,394", "8,4295", "8,4456", "8,4794", "8,7437", "9,0304", "9,0615", "9,4427", "9,6618", "9,77", "9,8295", "9,9833"), class = "factor"), X2006 = structure(c(25L, 2L, 31L, 34L, 13L, 14L), .Label = c("10,0326", "10,2177", "10,3877", "10,6374", "10,7468", "10,9516", "15,9368", "5,0169", "5,6845", "5,8109", "6,1019", "6,2008", "6,285", "6,6937", "7,3477", "7,5148", "7,5836", "7,7495", "8,1986", "8,2586", "8,2807", "8,3448", "8,39", "8,4289", "8,5204", "8,564", "8,8247", "8,8401", "8,948", "9,1292", "9,4811", "9,7487", "9,9243", "9,9621"), class = "factor"), X2007 = structure(c(27L, 3L, 31L, 1L, 14L, 13L), .Label = c("10,0263", "10,2099", "10,2617", "10,4771", "10,7642", "10,8754", "16,1608", "5,1597", "5,7779", "6,0372", "6,3331", "6,3858", "6,5223", "6,5494", "7,1288", "7,6299", "7,6744", "7,7553", "7,8565", "7,9023", "8,043", "8,2295", "8,4769", "8,4908", "8,5014", "8,504", "8,5531", "8,746", "8,9172", "9,0913", "9,5254", "9,8104", "9,9873", "9,9942"), class = "factor"), X2008 = structure(c(26L, 6L, 34L, 4L, 17L, 15L), .Label = c("10,1268", "10,183", "10,2189", "10,2537", "10,289", "10,4896", "10,7042", "10,9909", "11,0232", "16,6201", "5,8474", "6,0577", "6,0745", "6,586", "6,8189", "6,8863", "7,1361", "7,1819", 
"7,4631", "7,7052", "8,0208", "8,3068", "8,3457", "8,5513", "8,605", "8,751", "8,8915", "8,9402", "8,9521", "9,0591", "9,1344", "9,2284", "9,3051", "9,9128"), class = "factor"), X2009 = structure(c(24L, 8L, 5L, 9L, 21L, 22L), .Label = c("", "10,0115", "10,0496", "10,1957", "10,5938", "10,8137", "11,0005", "11,1729", "11,3992", "11,4722", "11,7314", "11,7516", "11,8823", "17,6706", "6,4098", "7,039", "7,1018", "7,2127", "7,6797", "7,7356", "7,8649", "7,9514", "7,9657", "9,0423", "9,152", "9,17", "9,1947", "9,4037", "9,5258", "9,6247", "9,636", "9,6743", "9,9056", "9,939"), class = "factor"), X2010 = structure(c(23L, 6L, 3L, 8L, 18L, 19L), .Label = c("", "10,1995", "10,503", "10,797", "10,8817", "11,0318", "11,0751", "11,3738", "11,5495", "11,677", "12,0661", "17,6911", "6,1782", "6,3394", "7,0229", "7,1675", "7,2911", "7,37", "7,4319", "7,6856", "8,0302", "8,8718", "8,9481", "8,9888", "8,995", "9,2925", "9,312", "9,4079", "9,4224", "9,4688", "9,5277", "9,5504", "9,589", "9,6074"), class = "factor"), X2011 = structure(c(NA, 5L, 4L, 8L, 18L, 17L), .Label = c("", "10,2345", "10,2844", "10,5139", "10,7769", "10,8671", "11,0148", "11,1784", "11,3323", "11,6343", "11,9369", "17,683", "5,922", "6,6464", "6,8709", "7,3692", "7,5011", "7,5206", "7,7333", "7,8877", "7,9416", "8,8501", "8,9042", "9,0019", "9,027", "9,1296", "9,2256", "9,2837", "9,2969", "9,4184", "9,4661"), class = "factor"), X2012 = structure(c(NA, NA, NA, 2L, 5L, NA), .Label = c("", "11,2132", "11,2955", "7,5249", "7,6077", "7,8226", "8,7596", "8,923", "9,148", "9,167", "9,3722"), class = "factor")), .Names = c("country", "X2005", "X2006", "X2007", "X2008", "X2009", "X2010", "X2011", "X2012"), row.names = c(NA, 6L), class = "data.frame")
Alternatively, you can read your data in the correct format in the first place.
If you use read.table, for instance, set the dec = "," parameter. You'll probably need to set the na.strings parameter as well (e.g. na.strings = "<NA>").
You could use a for loop:
# Convert every column except the first (country) from text or factor with a
# decimal comma to numeric. names(df)[-1] is used instead of names(df[, -1])
# because the latter collapses to an unnamed vector when only one value
# column remains; [[ ]] also works when df is a tibble.
# Values that are not numbers after the replacement (e.g. "") become NA.
for (col in names(df)[-1]) {
  df[[col]] <- as.numeric(gsub(",", ".", df[[col]], fixed = TRUE))
}