How can I make several lines in ggplot with several group layers? - r

I have divided my plots into 2 based on Sportsbook and Casino. How is it possible to also split the line into several lines (different colors) to show the different markets? I tried to use fill=market at the end of the ggplot function, however it did not help.
library(ggplot2)
data<-structure(list(wday = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L,
4L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L,
7L, 7L, 7L), .Label = c("Monday", "Tuesday", "Wednesday", "Thursday",
"Friday", "Saturday", "Sunday"), class = "factor"), market = c("France",
"France", "Germany", "Germany", "Poland", "Poland", "France",
"France", "Germany", "Germany", "Poland", "Poland", "France",
"France", "Germany", "Germany", "Poland", "Poland", "France",
"France", "Germany", "Germany", "Poland", "Poland", "France",
"France", "Germany", "Germany", "Poland", "Poland", "France",
"France", "Germany", "Germany", "Poland", "Poland", "France",
"France", "Germany", "Germany", "Poland", "Poland"), product_preference = c("Casino",
"Sportsbook", "Casino", "Sportsbook", "Casino", "Sportsbook",
"Casino", "Sportsbook", "Casino", "Sportsbook", "Casino", "Sportsbook",
"Casino", "Sportsbook", "Casino", "Sportsbook", "Casino", "Sportsbook",
"Casino", "Sportsbook", "Casino", "Sportsbook", "Casino", "Sportsbook",
"Casino", "Sportsbook", "Casino", "Sportsbook", "Casino", "Sportsbook",
"Casino", "Sportsbook", "Casino", "Sportsbook", "Casino", "Sportsbook",
"Casino", "Sportsbook", "Casino", "Sportsbook", "Casino", "Sportsbook"
), ggr = c(3349.80897892753, 161.917715712988, 17700.4568364611,
-123.342131455399, 17208.7731385281, 3128.51277864992, 2877.17330617787,
28.5162781278127, 13453.7092912371, -82.8980672268908, 13611.1197727273,
9910.32070866143, 3939.20578803854, 126.311590466926, 19097.2664228723,
-94.5491666666667, 16706.9427008929, 2636.63687707641, 3393.43150322119,
176.953280238925, 23414.9515950069, -72.4428986866791, 16140.8680085653,
5618.00758333333, 3007.18322084806, 69.4383454281568, 18018.1755748663,
-77.87698, 19889.0339183673, 5561.69038585209, 4205.12735472371,
-16.0552268431002, 17166.1121932115, -117.149356025759, 18527.8546597938,
6806.36808346213, 3446.70375835385, 56.6674850849013, 18026.2400535475,
-67.3431629701062, 13641.4965135699, 11470.3083969466)), row.names = c(NA,
-42L), groups = structure(list(wday = structure(c(1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L,
7L, 7L), .Label = c("Monday", "Tuesday", "Wednesday", "Thursday",
"Friday", "Saturday", "Sunday"), class = "factor"), market = c("France",
"Germany", "Poland", "France", "Germany", "Poland", "France",
"Germany", "Poland", "France", "Germany", "Poland", "France",
"Germany", "Poland", "France", "Germany", "Poland", "France",
"Germany", "Poland"), .rows = structure(list(1:2, 3:4, 5:6, 7:8,
9:10, 11:12, 13:14, 15:16, 17:18, 19:20, 21:22, 23:24, 25:26,
27:28, 29:30, 31:32, 33:34, 35:36, 37:38, 39:40, 41:42), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -21L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), na.action = structure(43:46, .Names = c("43",
"44", "45", "46"), class = "omit"), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
# Plot ggr by weekday with one panel per product_preference. The group
# aesthetic is product_preference, so each panel gets a single line and the
# markets are not drawn separately.
ggplot() +
geom_line(data = data,aes(x = wday, y = ggr,group = product_preference))+
facet_grid(.~product_preference,scales="free")

You can define the color of your lines by adding color = <grouping variable>.
Also, you already do a facet grid on product_preference, so there seems to be no need to define group = product_preference.
Try this:
# One line per market, coloured by market; the facets already separate the
# two product preferences, so no grouping by product is needed.
ggplot(
  data = data,
  mapping = aes(x = wday, y = ggr, color = market, group = market)
) +
  geom_line() +
  facet_grid(. ~ product_preference, scales = "free")

I am not sure this is what you wanted, but I would replace geom_line with 'geom_col'. 'geom_col' also takes the 'fill' attribute, while 'geom_line' would require 'color'.
So my suggestion would be the following:
# Bars instead of lines: geom_col() uses the fill aesthetic to colour the
# bars by market, with one panel per product preference.
ggplot(
  data = data,
  mapping = aes(
    x = wday,
    y = ggr,
    group = product_preference,
    fill = market
  )
) +
  geom_col() +
  facet_grid(. ~ product_preference, scales = "free")
This results in the following plot:

Related

Country Flags are incorrectly shown in the graph

I'm trying to display the flags for each country. But it seems most of the flags are incorrect according to my code. I want them to be displayed only at the end of the geom line, add the flag to the legend, and make the flag more visible than how it displays now.
categ_top10EnergyModf %>%
mutate(country = tolower(country)) %>%
ggplot(aes(x= year, y=ggwt_hours, country = country, color=country, group=country))+
geom_line(size=1.5)+
geom_point(size=3)+
geom_flag(aes(country = factor(country), size = 4))+
scale_y_continuous(labels = scales::comma)+
facet_wrap(~type2,scale='free')+
labs(x= "Year", y= "Energy Production (GWh)", title = "Analysis of the Growth of Renewable/Non-Renewable Energy Production",
color="Country",fill = "country" )+
scale_color_discrete(name = "Country",
labels= c("Germany",
"Spain",
"France",
"Italy",
"Norway",
"Poland",
"Sweden",
"Turkey",
"Ukraine",
"United Kingdom")
)+
theme_grey() +
theme(plot.title = element_text(hjust = 0.5))
> dput(head(categ_top10EnergyModf))
structure(list(country = c("de", "de", "fr", "fr", "fr", "de"
), country_name = c("Germany", "Germany", "France", "France",
"France", "Germany"), type2 = c("Non-Renewable", "Non-Renewable",
"Non-Renewable", "Non-Renewable", "Non-Renewable", "Non-Renewable"
), year = structure(c(1L, 2L, 2L, 3L, 1L, 3L), .Label = c("2016",
"2017", "2018"), class = "factor"), ggwt_hours = c(471984, 449906,
448690.614, 447109.694, 445175.494, 393234.585)), row.names = c(NA,
-6L), groups = structure(list(country = c("de", "de", "de", "fr",
"fr", "fr"), country_name = c("Germany", "Germany", "Germany",
"France", "France", "France"), year = structure(c(1L, 2L, 3L,
1L, 2L, 3L), .Label = c("2016", "2017", "2018"), class = "factor"),
.rows = structure(list(1L, 2L, 6L, 5L, 3L, 4L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 6L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
You could display a flag as the last point:
geom_flag(aes(x = ifelse(year == 2018, year, NA)), size = 4)
I also set the size not as an aesthetic.
You don't have to use factor(country). You already defined your country aesthetic in ggplot().

After executing the bake(), it doesn't show all the predictors and the outcome in the result

I am writing a model for my dataset. Once the bake() is executed, the result has one missing predictor and the outcome.
This happens after writing the recipe steps. Is there any way to resolve this issue?
top10_renewableEnergyProd_split <- initial_split(top10_renewableEnergyProd)
top10_renewableEnergyProd_train <- training(top10_renewableEnergyProd_split)
top10_renewableEnergyProd_test <- testing(top10_renewableEnergyProd_split)
top10_renewableEnergyProd_recipe <- recipe(energyProd_2018 ~ country_name + energyProd_2016 + energyProd_2017 , data = top10_renewableEnergyProd_train)
#recipe steps
# Add preprocessing steps to the recipe: centre and scale the numeric
# columns other than the outcome, then apply a correlation filter.
top10_renewableEnergyProd_recipe <- top10_renewableEnergyProd_recipe %>%
step_center(all_numeric(), -all_outcomes()) %>%
step_scale(all_numeric(), -all_outcomes()) %>%
# NOTE(review): unlike the two steps above, this selector does not exclude
# the outcome, so step_corr() may be dropping energyProd_2018 (and a highly
# correlated predictor) -- confirm whether -all_outcomes() was intended.
step_corr(all_numeric())
top10_renewableEnergyProd_prep <- prep(top10_renewableEnergyProd_recipe, training = top10_renewableEnergyProd_train)
top10_renewableEnergyProd_bake <- bake(top10_renewableEnergyProd_prep, top10_renewableEnergyProd_train)
top10_renewableEnergyProd_bake
> dput(top10_renewableEnergyProd)
structure(list(type2 = c("Renewable", "Renewable", "Renewable",
"Renewable", "Renewable", "Renewable", "Renewable", "Renewable",
"Renewable", "Renewable"), country = c("DE", "ES", "FR", "IT",
"NO", "PL", "SE", "TR", "UA", "UK"), country_name = c("Germany",
"Spain", "France", "Italy", "Norway", "Poland", "Sweden", "Turkey",
"Ukraine", "United Kingdom"), energyProd_2016 = c(147622, 103353,
99885.054, 90756.826, 146557, 15468, 77505, 87090, 12097, 58909.047
), energyProd_2017 = c(175063, 84664, 93907.184, 86786.294, 146285,
18187.708, 82540, 83536.342, 12082.6, 73113.964), energyProd_2018 = c(185226.211,
99725.566, 113658.177, 96820, 146878.825, 15541.473, 77615.947,
93425.906, 13843.9, 79955.967)), row.names = c(NA, -10L), groups = structure(list(
country = c("DE", "ES", "FR", "IT", "NO", "PL", "SE", "TR",
"UA", "UK"), country_name = c("Germany", "Spain", "France",
"Italy", "Norway", "Poland", "Sweden", "Turkey", "Ukraine",
"United Kingdom"), type2 = c("Renewable", "Renewable", "Renewable",
"Renewable", "Renewable", "Renewable", "Renewable", "Renewable",
"Renewable", "Renewable"), .rows = structure(list(1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
> dput(top10_renewableEnergyProd_bake)
structure(list(country_name = structure(c(2L, 5L, 1L, 3L, 4L,
6L, 7L, 8L), .Label = c("France", "Germany", "Italy", "Norway",
"Spain", "Sweden", "Turkey", "United Kingdom"), class = "factor"),
energyProd_2016 = c(1.47285970883518, 0.0604065991667829,
-0.0502421869332435, -0.341488748282317, 1.4388796649716,
-0.764303423167326, -0.458483028395103, -1.35762858619557
)), row.names = c(NA, -8L), class = c("tbl_df", "tbl", "data.frame"
))

Replicating a Data Visualization with R/ggplot

Replicating a visualization I saw in print media using ggplot2
Context:
I am always looking to make data visualizations more appealing/aesthetic specifically for non-data people, who are the majority of people I work with (stakeholders like marketers, management, etc) -- I've noted that when visualizations look like academic-publication-quality (standard ggplot2 aesthetics) they tend to assume they can't understand it and don't bother trying, defeating the whole purpose of visualizations in the first place. However, when it looks more graphic'y (like something you may see on websites or marketing material) they focus and try to understand the visualization, usually successfully. Often we'll end up in the most interesting discussions from these types of visualizations, so that is my ultimate goal.
The Visualization:
Here is something I saw on some marketing brochure on the device share of web traffic by geo, and though it is actually a bit busy and unclear, it resonated better than a similar stacked bar chart I created in standard -- I have not the slightest idea how I might replicate something like this within ggplot2, any attempts would be much appreciated! Here is some sample tidy data to use in a data.table:
structure(list(country = c("Argentina", "Argentina", "Argentina",
"Brazil", "Brazil", "Brazil", "Canada",
"Canada", "Canada", "China", "China",
"China", "Japan", "Japan", "Japan", "Spain",
"Spain", "Spain", "UK", "UK", "UK", "USA",
"USA", "USA"),
device_type = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L),
class = "factor",
.Label = c("desktop",
"mobile",
"multi")),
proportion = c(0.37, 0.22, 0.41, 0.3, 0.31, 0.39,
0.35, 0.06, 0.59, 0.19, 0.2, 0.61,
0.4, 0.18, 0.42, 0.16, 0.28, 0.56,
0.27, 0.06, 0.67, 0.37, 0.08, 0.55)),
.Names = c("country", "device_type", "proportion"),
row.names = c(NA, -24L),
class = c("data.table", "data.frame"))
You could also consider googleVis
library(googleVis)
dat <- structure(list(country = c("Argentina", "Argentina", "Argentina",
"Brazil", "Brazil", "Brazil", "Canada",
"Canada", "Canada", "China", "China",
"China", "Japan", "Japan", "Japan", "Spain",
"Spain", "Spain", "UK", "UK", "UK", "USA",
"USA", "USA"),
device_type = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L),
class = "factor",
.Label = c("desktop",
"mobile",
"multi")),
proportion = c(0.37, 0.22, 0.41, 0.3, 0.31, 0.39,
0.35, 0.06, 0.59, 0.19, 0.2, 0.61,
0.4, 0.18, 0.42, 0.16, 0.28, 0.56,
0.27, 0.06, 0.67, 0.37, 0.08, 0.55)),
.Names = c("country", "device_type", "proportion"),
row.names = c(NA, -24L),
class = c("data.table", "data.frame"))
link_order <- unique(dat$country)
node_order <- unique(as.vector(rbind(dat$country, as.character(dat$device_type))))
# Colour for each link, keyed by the source country name.
link_cols <- data.frame(
  color = c("#ffd1ab", "#ff8d14", "#ff717e", "#dd2c40",
            "#d6b0ea", "#8c4fab", "#00addb", "#297cbe"),
  country = c("UK", "Canada", "USA", "China",
              "Spain", "Japan", "Argentina", "Brazil"),
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE
)
# Colour for each node, keyed by node name (countries and device types).
node_cols <- data.frame(
  color = c("#ffc796", "#ff7100", "#ff485b", "#d20000",
            "#cc98e6", "#6f2296", "#009bd2", "#005daf",
            "grey", "grey", "grey"),
  type = c("UK", "Canada", "USA", "China", "Spain", "Japan",
           "Argentina", "Brazil", "multi", "desktop", "mobile"),
  # Keep the columns as character vectors on every R version (before R 4.0
  # the default would turn them into factors), matching link_cols.
  stringsAsFactors = FALSE
)
# Look up the colour of every link (country) and node in plotting order.
# vapply() guarantees exactly one colour string per element and stops with
# an error if a name has no (or more than one) colour, where sapply() would
# silently return a list. as.character() keeps this working when the colour
# column is a factor.
link_cols2 <- vapply(
  link_order,
  function(x) as.character(link_cols$color[link_cols$country == x]),
  character(1)
)
node_cols2 <- vapply(
  node_order,
  function(x) as.character(node_cols$color[node_cols$type == x]),
  character(1)
)
actual_link_cols <- paste0("[", paste0("'", link_cols2,"'", collapse = ','), "]")
actual_node_cols <- paste0("[", paste0("'", node_cols2,"'", collapse = ','), "]")
opts <- paste0("{
link: { colorMode: 'source',
colors: ", actual_link_cols ," },
node: {colors: ", actual_node_cols ,"}}")
Sankey <- gvisSankey(dat,
from = "country",
to = "device_type",
weight = "proportion",
options = list(height = 500, width = 1000, sankey = opts))
plot(Sankey)
You can try with "ggalluvial" package and its respective "geom".
Check this out

Mapping points in a map

I try to show country data in a map using points in the map. Here the dataframe:
> dput(countries)
structure(list(country = structure(c(5L, 6L, 3L, 4L, 10L, 8L,
11L, 7L, 1L, 13L, 9L, 12L, 2L), .Label = c("Australia", "China",
"France", "Georgia", "India", "Ireland", "Malaysia", "Poland",
"Qatar", "Singapore", "South Africa", "Spain", "USA"), class = "factor"),
Latitude = c(20.593684, 53.142367, 46.227638, 32.165622,
1.352083, 51.919438, -30.559482, 4.210484, -25.274398, 37.09024,
25.354826, 40.463667, 35.86166), Longitude = c(78.96288,
-7.692054, 2.213749, -82.900075, 103.819836, 19.145136, 22.937506,
101.975766, 133.775136, -95.712891, 51.183884, -3.74922,
104.195397), Value = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 2L, 2L)), .Names = c("country", "Latitude", "Longitude",
"Value"), class = "data.frame", row.names = c(NA, -13L))
The code from here:
library(maps)
library(ggplot2)
base_world <- map_data("world")
map_data_coloured <-
base_world +
geom_point(data=countries,
aes(x=Longitude, y=Latitude, colour=Value), size=5, alpha=I(0.7))
But I receive this error:
Error in as.vector(x, mode) :
cannot coerce type 'environment' to vector of type 'any'
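You need to start from ggplot() and draw your base_world data with a geom_polygon layer. base_world is a data frame returned by map_data(), not a plot, so you cannot add layers to it directly with +.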
# Draw the world outline first, then overlay one point per country.
ggplot() +
  geom_polygon(
    data = base_world,
    mapping = aes(x = long, y = lat, group = group)
  ) +
  geom_point(
    data = countries,
    mapping = aes(x = Longitude, y = Latitude, colour = Value),
    size = 5,
    alpha = I(0.7)
  )

How to change values more efficiently

I want to replace the , with a . as the decimal point in a dataframe. I can do this with df$X2005 <- as.numeric(gsub(',', '.', df$X2005)) for each variable. Is there a more efficient way to do this for the whole dataframe at once?
Some example data:
df <- structure(list(country = structure(1:6, .Label = c("Australia", "Austria", "Belgium", "Canada", "Chile", "Czech Republic", "Denmark", "Estonia", "Finland", "France", "Germany", "Greece", "Hungary", "Iceland", "Ireland", "Israel", "Italy", "Japan", "Korea", "Luxembourg", "Mexico", "Netherlands", "New Zealand", "Norway", "Poland", "Portugal", "Slovak Republic", "Slovenia", "Spain", "Sweden", "Switzerland", "Turkey", "United Kingdom", "United States"), class = "factor"), X2005 = structure(c(26L, 2L, 34L, 33L, 13L, 14L), .Label = c("10,3533", "10,4187", "10,8089", "10,8629", "10,882", "11,0173", "15,8399", "5,0226", "5,4488", "5,6273", "5,8713", "6,2137", "6,6397", "6,9339", "7,0448", "7,5719", "7,8534", "7,9457", "8,1819", "8,2668", "8,2883", "8,3556", "8,394", "8,4295", "8,4456", "8,4794", "8,7437", "9,0304", "9,0615", "9,4427", "9,6618", "9,77", "9,8295", "9,9833"), class = "factor"), X2006 = structure(c(25L, 2L, 31L, 34L, 13L, 14L), .Label = c("10,0326", "10,2177", "10,3877", "10,6374", "10,7468", "10,9516", "15,9368", "5,0169", "5,6845", "5,8109", "6,1019", "6,2008", "6,285", "6,6937", "7,3477", "7,5148", "7,5836", "7,7495", "8,1986", "8,2586", "8,2807", "8,3448", "8,39", "8,4289", "8,5204", "8,564", "8,8247", "8,8401", "8,948", "9,1292", "9,4811", "9,7487", "9,9243", "9,9621"), class = "factor"), X2007 = structure(c(27L, 3L, 31L, 1L, 14L, 13L), .Label = c("10,0263", "10,2099", "10,2617", "10,4771", "10,7642", "10,8754", "16,1608", "5,1597", "5,7779", "6,0372", "6,3331", "6,3858", "6,5223", "6,5494", "7,1288", "7,6299", "7,6744", "7,7553", "7,8565", "7,9023", "8,043", "8,2295", "8,4769", "8,4908", "8,5014", "8,504", "8,5531", "8,746", "8,9172", "9,0913", "9,5254", "9,8104", "9,9873", "9,9942"), class = "factor"), X2008 = structure(c(26L, 6L, 34L, 4L, 17L, 15L), .Label = c("10,1268", "10,183", "10,2189", "10,2537", "10,289", "10,4896", "10,7042", "10,9909", "11,0232", "16,6201", "5,8474", "6,0577", "6,0745", "6,586", "6,8189", "6,8863", "7,1361", "7,1819", 
"7,4631", "7,7052", "8,0208", "8,3068", "8,3457", "8,5513", "8,605", "8,751", "8,8915", "8,9402", "8,9521", "9,0591", "9,1344", "9,2284", "9,3051", "9,9128"), class = "factor"), X2009 = structure(c(24L, 8L, 5L, 9L, 21L, 22L), .Label = c("", "10,0115", "10,0496", "10,1957", "10,5938", "10,8137", "11,0005", "11,1729", "11,3992", "11,4722", "11,7314", "11,7516", "11,8823", "17,6706", "6,4098", "7,039", "7,1018", "7,2127", "7,6797", "7,7356", "7,8649", "7,9514", "7,9657", "9,0423", "9,152", "9,17", "9,1947", "9,4037", "9,5258", "9,6247", "9,636", "9,6743", "9,9056", "9,939"), class = "factor"), X2010 = structure(c(23L, 6L, 3L, 8L, 18L, 19L), .Label = c("", "10,1995", "10,503", "10,797", "10,8817", "11,0318", "11,0751", "11,3738", "11,5495", "11,677", "12,0661", "17,6911", "6,1782", "6,3394", "7,0229", "7,1675", "7,2911", "7,37", "7,4319", "7,6856", "8,0302", "8,8718", "8,9481", "8,9888", "8,995", "9,2925", "9,312", "9,4079", "9,4224", "9,4688", "9,5277", "9,5504", "9,589", "9,6074"), class = "factor"), X2011 = structure(c(NA, 5L, 4L, 8L, 18L, 17L), .Label = c("", "10,2345", "10,2844", "10,5139", "10,7769", "10,8671", "11,0148", "11,1784", "11,3323", "11,6343", "11,9369", "17,683", "5,922", "6,6464", "6,8709", "7,3692", "7,5011", "7,5206", "7,7333", "7,8877", "7,9416", "8,8501", "8,9042", "9,0019", "9,027", "9,1296", "9,2256", "9,2837", "9,2969", "9,4184", "9,4661"), class = "factor"), X2012 = structure(c(NA, NA, NA, 2L, 5L, NA), .Label = c("", "11,2132", "11,2955", "7,5249", "7,6077", "7,8226", "8,7596", "8,923", "9,148", "9,167", "9,3722"), class = "factor")), .Names = c("country", "X2005", "X2006", "X2007", "X2008", "X2009", "X2010", "X2011", "X2012"), row.names = c(NA, 6L), class = "data.frame")
You can instead read your data in the correct format.
If you use read.table for instance, set the dec = "," parameter. You'll probably need to adjust the na.strings parameter (e.g. na.strings = "<NA>") as well.
You could use a for loop:
# Convert every column except the first (country) from text with a decimal
# comma, e.g. "10,35", to numeric.
# names(df)[-1] is used instead of names(df[, -1]): the latter collapses to
# an unnamed vector when df has only two columns, so the loop would silently
# do nothing.
for (col in names(df)[-1]) {
  # [[ always extracts the column as a vector (also for tibbles); gsub()
  # converts factors to character before replacing the comma.
  df[[col]] <- as.numeric(gsub(",", ".", df[[col]], fixed = TRUE))
}

Resources