Related
I have split my plot into two panels, one for Sportsbook and one for Casino. How can I also split the line in each panel into several lines (in different colors), one per market? I tried adding fill=market at the end of the ggplot call, but that did not help.
library(ggplot2)
data<-structure(list(wday = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L,
4L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L,
7L, 7L, 7L), .Label = c("Monday", "Tuesday", "Wednesday", "Thursday",
"Friday", "Saturday", "Sunday"), class = "factor"), market = c("France",
"France", "Germany", "Germany", "Poland", "Poland", "France",
"France", "Germany", "Germany", "Poland", "Poland", "France",
"France", "Germany", "Germany", "Poland", "Poland", "France",
"France", "Germany", "Germany", "Poland", "Poland", "France",
"France", "Germany", "Germany", "Poland", "Poland", "France",
"France", "Germany", "Germany", "Poland", "Poland", "France",
"France", "Germany", "Germany", "Poland", "Poland"), product_preference = c("Casino",
"Sportsbook", "Casino", "Sportsbook", "Casino", "Sportsbook",
"Casino", "Sportsbook", "Casino", "Sportsbook", "Casino", "Sportsbook",
"Casino", "Sportsbook", "Casino", "Sportsbook", "Casino", "Sportsbook",
"Casino", "Sportsbook", "Casino", "Sportsbook", "Casino", "Sportsbook",
"Casino", "Sportsbook", "Casino", "Sportsbook", "Casino", "Sportsbook",
"Casino", "Sportsbook", "Casino", "Sportsbook", "Casino", "Sportsbook",
"Casino", "Sportsbook", "Casino", "Sportsbook", "Casino", "Sportsbook"
), ggr = c(3349.80897892753, 161.917715712988, 17700.4568364611,
-123.342131455399, 17208.7731385281, 3128.51277864992, 2877.17330617787,
28.5162781278127, 13453.7092912371, -82.8980672268908, 13611.1197727273,
9910.32070866143, 3939.20578803854, 126.311590466926, 19097.2664228723,
-94.5491666666667, 16706.9427008929, 2636.63687707641, 3393.43150322119,
176.953280238925, 23414.9515950069, -72.4428986866791, 16140.8680085653,
5618.00758333333, 3007.18322084806, 69.4383454281568, 18018.1755748663,
-77.87698, 19889.0339183673, 5561.69038585209, 4205.12735472371,
-16.0552268431002, 17166.1121932115, -117.149356025759, 18527.8546597938,
6806.36808346213, 3446.70375835385, 56.6674850849013, 18026.2400535475,
-67.3431629701062, 13641.4965135699, 11470.3083969466)), row.names = c(NA,
-42L), groups = structure(list(wday = structure(c(1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L,
7L, 7L), .Label = c("Monday", "Tuesday", "Wednesday", "Thursday",
"Friday", "Saturday", "Sunday"), class = "factor"), market = c("France",
"Germany", "Poland", "France", "Germany", "Poland", "France",
"Germany", "Poland", "France", "Germany", "Poland", "France",
"Germany", "Poland", "France", "Germany", "Poland", "France",
"Germany", "Poland"), .rows = structure(list(1:2, 3:4, 5:6, 7:8,
9:10, 11:12, 13:14, 15:16, 17:18, 19:20, 21:22, 23:24, 25:26,
27:28, 29:30, 31:32, 33:34, 35:36, 37:38, 39:40, 41:42), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -21L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), na.action = structure(43:46, .Names = c("43",
"44", "45", "46"), class = "omit"), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
# Line chart of ggr per weekday, one panel per product preference.
# Rows are grouped by product_preference, so each panel holds a single line.
ggplot(data, aes(x = wday, y = ggr, group = product_preference)) +
  geom_line() +
  facet_grid(. ~ product_preference, scales = "free")
You can define the color of your lines by adding color = <grouping variable>.
Also, you already do a facet grid on product_preference, so there seems to be no need to define group = product_preference.
Try this:
# One line per market (grouped and coloured by market) inside each
# product_preference panel.
ggplot(data, aes(x = wday, y = ggr, color = market, group = market)) +
  geom_line() +
  facet_grid(. ~ product_preference, scales = "free")
I am not sure this is what you wanted, but I would replace geom_line with 'geom_col'. 'geom_col' also takes the 'fill' attribute, while 'geom_line' would require 'color'.
So my suggestion would be the following:
# Columns of ggr per weekday, filled by market, one panel per product
# preference.
ggplot(
  data,
  aes(x = wday, y = ggr, group = product_preference, fill = market)
) +
  geom_col() +
  facet_grid(. ~ product_preference, scales = "free")
This results in the following plot:
I am writing a model for my dataset. Once bake() is executed, the result is missing one predictor and the outcome.
This happens after adding the recipe steps. Is there any way to resolve this issue?
# Split the data into training and testing partitions.
top10_renewableEnergyProd_split <- initial_split(top10_renewableEnergyProd)
top10_renewableEnergyProd_train <- training(top10_renewableEnergyProd_split)
top10_renewableEnergyProd_test <- testing(top10_renewableEnergyProd_split)

# Recipe: model energyProd_2018 from the country name and the two earlier
# years, then centre and scale the numeric predictors and apply the
# correlation filter.
# NOTE(review): step_corr() is given all_numeric() without -all_outcomes(),
# unlike the two steps before it. Presumably this lets the filter drop the
# outcome as well as a highly correlated predictor -- confirm against the
# step_corr() documentation.
top10_renewableEnergyProd_recipe <- recipe(
  energyProd_2018 ~ country_name + energyProd_2016 + energyProd_2017,
  data = top10_renewableEnergyProd_train
) %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes()) %>%
  step_corr(all_numeric())

# Estimate the steps on the training data, then apply them to that same data.
top10_renewableEnergyProd_prep <- prep(
  top10_renewableEnergyProd_recipe,
  training = top10_renewableEnergyProd_train
)
top10_renewableEnergyProd_bake <- bake(
  top10_renewableEnergyProd_prep,
  top10_renewableEnergyProd_train
)
top10_renewableEnergyProd_bake
> dput(top10_renewableEnergyProd)
structure(list(type2 = c("Renewable", "Renewable", "Renewable",
"Renewable", "Renewable", "Renewable", "Renewable", "Renewable",
"Renewable", "Renewable"), country = c("DE", "ES", "FR", "IT",
"NO", "PL", "SE", "TR", "UA", "UK"), country_name = c("Germany",
"Spain", "France", "Italy", "Norway", "Poland", "Sweden", "Turkey",
"Ukraine", "United Kingdom"), energyProd_2016 = c(147622, 103353,
99885.054, 90756.826, 146557, 15468, 77505, 87090, 12097, 58909.047
), energyProd_2017 = c(175063, 84664, 93907.184, 86786.294, 146285,
18187.708, 82540, 83536.342, 12082.6, 73113.964), energyProd_2018 = c(185226.211,
99725.566, 113658.177, 96820, 146878.825, 15541.473, 77615.947,
93425.906, 13843.9, 79955.967)), row.names = c(NA, -10L), groups = structure(list(
country = c("DE", "ES", "FR", "IT", "NO", "PL", "SE", "TR",
"UA", "UK"), country_name = c("Germany", "Spain", "France",
"Italy", "Norway", "Poland", "Sweden", "Turkey", "Ukraine",
"United Kingdom"), type2 = c("Renewable", "Renewable", "Renewable",
"Renewable", "Renewable", "Renewable", "Renewable", "Renewable",
"Renewable", "Renewable"), .rows = structure(list(1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
> dput(top10_renewableEnergyProd_bake)
structure(list(country_name = structure(c(2L, 5L, 1L, 3L, 4L,
6L, 7L, 8L), .Label = c("France", "Germany", "Italy", "Norway",
"Spain", "Sweden", "Turkey", "United Kingdom"), class = "factor"),
energyProd_2016 = c(1.47285970883518, 0.0604065991667829,
-0.0502421869332435, -0.341488748282317, 1.4388796649716,
-0.764303423167326, -0.458483028395103, -1.35762858619557
)), row.names = c(NA, -8L), class = c("tbl_df", "tbl", "data.frame"
))
I have these two datasets that I am trying to append:
data1 = structure(list(year = c(2017, 2018), flow = c("Export", "Export"
), EUR = c(4, 3.44), Home = c(3.09, 3.03), Not_reported = c(0.12,
0), USD = c(92.29, 93.04), country = c("Brazil", "Brazil"), Other = c(0.499999999999994,
0.489999999999994)), row.names = c(NA, -2L), vars = c("year",
"flow"), drop = TRUE, indices = list(0L, 1L), group_sizes = c(1L,
1L), biggest_group_size = 1L, labels = structure(list(year = c(2017,
2018), flow = c("Export", "Export")), row.names = c(NA, -2L), vars = c("year",
"flow"), drop = TRUE, indices = list(0L, 1L), group_sizes = c(1L,
1L), biggest_group_size = 1L, labels = structure(list(year = c(2017,
2018), flow = c("EXP", "EXP")), class = "data.frame", row.names = c(NA,
-2L), vars = c("year", "flow"), drop = TRUE), class = "data.frame"), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
data2 = structure(list(flow = c("Export", "Export", "Export", "Export",
"Export", "Import"), country = structure(c(6L, 6L, 6L, 6L, 6L,
6L), .Label = c("Algeria", "Argentina", "Australia", "Austria",
"Belgium", "Brazil", "Bulgaria", "Canada", "China", "Colombia",
"Cyprus", "Czech Republic", "Denmark", "Estonia", "Euro", "Finland",
"France", "Germany", "Greece", "Hungary", "Iceland", "India",
"Indonesia", "Ireland", "Israel", "Italy", "Japan", "Latvia",
"Lithuania", "Luxembourg", "Malaysia", "Malta", "Morocco", "Netherlands",
"Pakistan", "Poland", "Portugal", "Romania", "Slovakia", "Slovenia",
"South Africa", "South Korea", "Spain", "Sweden", "Switzerland",
"Thailand", "Ukraine", "United Kingdom", "United States"), class = "factor"),
year = c(2007, 2008, 2009, 2010, 2011, 2007), EUR = c(4.76,
4.95, 4.51, 4.28, 3.8, 11.1), Home = c(0.13, 0.16, 1.11,
0.82, NA, 0.48), Not_reported = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), USD = c(94.7, 94.4, 93.8,
94.3, 94.5, 85.5), Other = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_)), row.names = c(NA, 6L), class = "data.frame")
When I tried:
# NOTE(review): data1 has class "grouped_df" while data2 is a plain
# data.frame; presumably the differing classes keep rbind() from settling on
# a single data-frame method, hence the list result -- verify, e.g. with
# rbind(as.data.frame(data1), data2).
rbind(data1, data2)
I got a list instead of a data frame. I have checked the class of each column and they seem consistent with each other. Can someone explain why this happens? Thanks!
I want to replace the , with a . as the decimal point in a dataframe. I can do this with df$X2005 <- as.numeric(gsub(',', '.', df$X2005)) for each variable. Is there a more efficient way to do this for the whole dataframe at once?
Some example data:
df <- structure(list(country = structure(1:6, .Label = c("Australia", "Austria", "Belgium", "Canada", "Chile", "Czech Republic", "Denmark", "Estonia", "Finland", "France", "Germany", "Greece", "Hungary", "Iceland", "Ireland", "Israel", "Italy", "Japan", "Korea", "Luxembourg", "Mexico", "Netherlands", "New Zealand", "Norway", "Poland", "Portugal", "Slovak Republic", "Slovenia", "Spain", "Sweden", "Switzerland", "Turkey", "United Kingdom", "United States"), class = "factor"), X2005 = structure(c(26L, 2L, 34L, 33L, 13L, 14L), .Label = c("10,3533", "10,4187", "10,8089", "10,8629", "10,882", "11,0173", "15,8399", "5,0226", "5,4488", "5,6273", "5,8713", "6,2137", "6,6397", "6,9339", "7,0448", "7,5719", "7,8534", "7,9457", "8,1819", "8,2668", "8,2883", "8,3556", "8,394", "8,4295", "8,4456", "8,4794", "8,7437", "9,0304", "9,0615", "9,4427", "9,6618", "9,77", "9,8295", "9,9833"), class = "factor"), X2006 = structure(c(25L, 2L, 31L, 34L, 13L, 14L), .Label = c("10,0326", "10,2177", "10,3877", "10,6374", "10,7468", "10,9516", "15,9368", "5,0169", "5,6845", "5,8109", "6,1019", "6,2008", "6,285", "6,6937", "7,3477", "7,5148", "7,5836", "7,7495", "8,1986", "8,2586", "8,2807", "8,3448", "8,39", "8,4289", "8,5204", "8,564", "8,8247", "8,8401", "8,948", "9,1292", "9,4811", "9,7487", "9,9243", "9,9621"), class = "factor"), X2007 = structure(c(27L, 3L, 31L, 1L, 14L, 13L), .Label = c("10,0263", "10,2099", "10,2617", "10,4771", "10,7642", "10,8754", "16,1608", "5,1597", "5,7779", "6,0372", "6,3331", "6,3858", "6,5223", "6,5494", "7,1288", "7,6299", "7,6744", "7,7553", "7,8565", "7,9023", "8,043", "8,2295", "8,4769", "8,4908", "8,5014", "8,504", "8,5531", "8,746", "8,9172", "9,0913", "9,5254", "9,8104", "9,9873", "9,9942"), class = "factor"), X2008 = structure(c(26L, 6L, 34L, 4L, 17L, 15L), .Label = c("10,1268", "10,183", "10,2189", "10,2537", "10,289", "10,4896", "10,7042", "10,9909", "11,0232", "16,6201", "5,8474", "6,0577", "6,0745", "6,586", "6,8189", "6,8863", "7,1361", "7,1819", 
"7,4631", "7,7052", "8,0208", "8,3068", "8,3457", "8,5513", "8,605", "8,751", "8,8915", "8,9402", "8,9521", "9,0591", "9,1344", "9,2284", "9,3051", "9,9128"), class = "factor"), X2009 = structure(c(24L, 8L, 5L, 9L, 21L, 22L), .Label = c("", "10,0115", "10,0496", "10,1957", "10,5938", "10,8137", "11,0005", "11,1729", "11,3992", "11,4722", "11,7314", "11,7516", "11,8823", "17,6706", "6,4098", "7,039", "7,1018", "7,2127", "7,6797", "7,7356", "7,8649", "7,9514", "7,9657", "9,0423", "9,152", "9,17", "9,1947", "9,4037", "9,5258", "9,6247", "9,636", "9,6743", "9,9056", "9,939"), class = "factor"), X2010 = structure(c(23L, 6L, 3L, 8L, 18L, 19L), .Label = c("", "10,1995", "10,503", "10,797", "10,8817", "11,0318", "11,0751", "11,3738", "11,5495", "11,677", "12,0661", "17,6911", "6,1782", "6,3394", "7,0229", "7,1675", "7,2911", "7,37", "7,4319", "7,6856", "8,0302", "8,8718", "8,9481", "8,9888", "8,995", "9,2925", "9,312", "9,4079", "9,4224", "9,4688", "9,5277", "9,5504", "9,589", "9,6074"), class = "factor"), X2011 = structure(c(NA, 5L, 4L, 8L, 18L, 17L), .Label = c("", "10,2345", "10,2844", "10,5139", "10,7769", "10,8671", "11,0148", "11,1784", "11,3323", "11,6343", "11,9369", "17,683", "5,922", "6,6464", "6,8709", "7,3692", "7,5011", "7,5206", "7,7333", "7,8877", "7,9416", "8,8501", "8,9042", "9,0019", "9,027", "9,1296", "9,2256", "9,2837", "9,2969", "9,4184", "9,4661"), class = "factor"), X2012 = structure(c(NA, NA, NA, 2L, 5L, NA), .Label = c("", "11,2132", "11,2955", "7,5249", "7,6077", "7,8226", "8,7596", "8,923", "9,148", "9,167", "9,3722"), class = "factor")), .Names = c("country", "X2005", "X2006", "X2007", "X2008", "X2009", "X2010", "X2011", "X2012"), row.names = c(NA, 6L), class = "data.frame")
You can instead read your data in the correct format.
If you use read.table, for instance, set the dec = "," argument. You'll probably need to set the na.strings = "<NA>" argument as well.
You could use a for loop:
# Convert every column except the first (country) from comma-decimal text
# to numeric.
# names(df)[-1] is used instead of names(df[, -1]): the latter drops to an
# unnamed vector when only one column remains, so the loop would silently
# do nothing.
for (col in names(df)[-1]) {
  # [[ ]] extracts the column as a vector for both data.frames and tibbles.
  # as.character() makes the factor-to-text step explicit, and fixed = TRUE
  # matches "," literally instead of as a regular expression.
  df[[col]] <- as.numeric(
    gsub(",", ".", as.character(df[[col]]), fixed = TRUE)
  )
}
I have this data frame:
dput(df2)
structure(list(Receiver = structure(c(4L, 3L, 2L, 1L), .Label = c("Australia",
"United Arab Emirates", "United Kingdom", "United States of America"
), class = "factor"), Sender = structure(c(1L, 1L, 1L, 1L), .Label = "United States of America", class = "factor")), .Names = c("Receiver",
"Sender"), row.names = c(NA, -4L), class = "data.frame")
I would like to draw an igraph plot like this:
library(igraph)
# Build the graph from df2; the first two columns (Receiver, Sender) are
# used as the edge list. graph_from_data_frame() replaces the deprecated
# graph.data.frame() alias.
g <- graph_from_data_frame(df2)

# Draw the graph with a Kamada-Kawai layout (layout_with_kk() replaces the
# deprecated layout.kamada.kawai()). TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
plot(
  g,
  layout = layout_with_kk,
  vertex.label = V(g)$name,
  vertex.label.color = "red",
  edge.arrow.size = 0.8,
  edge.curved = TRUE,
  edge.label.color = "white",
  edge.label.cex = 0.8,
  vertex.shape = "circle",
  edge.color = "pink",
  vertex.color = "lightblue",
  asp = 0,
  margin = 0
)
I would like to show the vertex labels inside the vertices, without increasing the size of the vertices. Any ideas how I can do this?
You can do something like this before the call to plot:
# Scale the vertex label text by 0.5 (set on the graph before calling plot()).
V(g)$label.cex <- 0.5
But why not use abbreviated names?
# Overwrite the vertex names with short labels; values are assigned
# positionally, in the graph's vertex order.
V(g)$name <- c("USA", "UK", "UAE", "Aus")
dat <- structure(list(Receiver = structure(c(4L, 3L, 2L, 1L), .Label = c("Australia",
"United Arab \nEmirates", "United \nKingdom", "United \nStates of \nAmerica"
), class = "factor"), Sender = structure(c(1L, 1L, 1L, 1L), .Label = "United \nStates of \nAmerica", class = "factor")), .Names = c("Receiver",
"Sender"), row.names = c(NA, -4L), class = "data.frame")
library(igraph)
# Build the graph from dat, whose country names contain "\n" line breaks so
# that the labels wrap. graph_from_data_frame() replaces the deprecated
# graph.data.frame() alias.
g <- graph_from_data_frame(dat)

# Scale the vertex label text by 0.6.
V(g)$label.cex <- 0.6

# Draw the graph with a Kamada-Kawai layout (layout_with_kk() replaces the
# deprecated layout.kamada.kawai()). TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
plot(
  g,
  layout = layout_with_kk,
  vertex.label = V(g)$name,
  vertex.label.color = "red",
  edge.arrow.size = 0.8,
  edge.curved = TRUE,
  edge.label.color = "white",
  edge.label.cex = 0.8,
  vertex.shape = "circle",
  edge.color = "pink",
  vertex.color = "lightblue",
  asp = 0,
  margin = 0
)
The following might help.
# Size each vertex by its degree: all edges counted, loops included, not
# normalized.
deg <- igraph::degree(
  graph = g,
  v = igraph::V(g),
  mode = "all",
  loops = TRUE,
  normalized = FALSE
)
igraph::V(g)$size <- deg * 3

# Scale each label relative to the largest vertex, so the biggest vertex gets
# label.cex = 1. The division is skipped when no vertex has a positive size,
# since 0 / 0 would give NaN label sizes.
max_size <- max(igraph::V(g)$size)
if (max_size > 0) {
  igraph::V(g)$label.cex <- igraph::V(g)$size / max_size
}