Related
I am currently working to merge two datasets in R. The first is a cross-national longitudinal dataset of democracy scores and inequality levels for countries over hundreds of years (15,034 observations, dat_as). The second is a cross-national longitudinal dataset of whether a given country in a given year has a legislature (27,192 observations, dat_vdem). I want to attach the legislatures data to the inequality data. The goal is to have a final df with the same number of observations (15,034). If there is a match, merge the data. If there is not a match, just insert an NA for the row. Every approach I have tried in R does not work. For example, using this code I get a df with 2,558,975 observations.
# Read the two country-year panels from CSV.
dat_as <- read.csv(file = "as.csv")
dat_vdem <- read.csv(file = "vdem.csv")
# Merge on the country code alone; year is not part of the key, so every
# dat_as row is paired with every dat_vdem row that has the same code.
test_df <- merge(x = dat_as, y = dat_vdem, by = "code")
Using this code, however, I get a df with 13,355 observations.
# Merge on country name and year together. With merge()'s default
# (all = FALSE) only keys present in both data frames are kept.
test_df <- merge(x = dat_as, y = dat_vdem, by = c("country", "year"))
What am I doing wrong? Any help would be appreciated. Below are reproducible data.
Here is the dat_as:
structure(list(X = 1:6, country = c("United States", "United States",
"United States", "United States", "United States", "United States"
), year = 1800:1805, scode = c("USA", "USA", "USA", "USA", "USA",
"USA"), code = c("USA", "USA", "USA", "USA", "USA", "USA"), democracy = c(1L,
1L, 1L, 1L, 1L, 1L), lagdemocracy = c(NA, 1L, 1L, 1L, 1L, 1L),
lbmginiint = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), lbmgdppint = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), ldemlbmginiint = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), ldemlbmgdppint = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), yearsq = c(3240000,
3243601, 3247204, 3250809, 3254416, 3258025), legislature = c(NA,
NA, NA, NA, NA, NA)), row.names = c(NA, 6L), class = "data.frame")
Here is the dat_vdem:
structure(list(X = 1:6, year = 1800:1805, country = c("United States", "United States", "United States", "United States", "United States", "United States"), code = c("USA",
"USA", "USA", "USA", "USA", "USA"), v2lgbicam = c(0L, 0L, 0L,
0L, 0L, 0L), v2lgqstexp = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), v2lgotovst = c(-2.1, -2.1, -2.1, -2.1, -2.1,
-2.1), v2lginvstp = c(-2.05, -2.05, -2.05, -2.05, -2.05, -2.05
), legislature = c(0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(NA,
6L), class = "data.frame")
You're describing a left join. The way I find easier is to use dplyr.
# Left join: keep every row of dat_as and attach the dat_vdem columns where
# country and year match; rows without a match get NA in those columns.
# The keys are given explicitly so that other shared columns ("X",
# "legislature") are not used for matching.
dplyr::left_join(dat_as, dat_vdem, by = c("country", "year"))
By default it joins on every column name the two data frames share. With the sample data you provided, that is "X", "country", "year", "code", and "legislature"; matching on "X" and "legislature" is probably not what you want, so specify the keys explicitly, e.g. by = c("country", "year").
I'm trying to display the flags for each country. But it seems most of the flags are incorrect according to my code. I want them to be displayed only at the end of the geom line, add the flag to the legend, and make the flag more visible than how it displays now.
categ_top10EnergyModf %>%
  # Lower-case the country codes before they are mapped to the plot.
  mutate(country = tolower(country)) %>%
  ggplot(aes(
    x = year, y = ggwt_hours,
    country = country, color = country, group = country
  )) +
  geom_line(size = 1.5) +
  geom_point(size = 3) +
  # Flag layer kept exactly as posted: it adds a flag at every data point and
  # passes size inside aes() rather than as a fixed layer parameter.
  geom_flag(aes(country = factor(country), size = 4)) +
  scale_y_continuous(labels = scales::comma) +
  # One panel per energy type, each with its own axes. The argument name is
  # spelled out in full (`scales`) instead of relying on partial matching.
  facet_wrap(~type2, scales = "free") +
  labs(
    x = "Year", y = "Energy Production (GWh)",
    title = "Analysis of the Growth of Renewable/Non-Renewable Energy Production",
    color = "Country", fill = "country"
  ) +
  # NOTE(review): these labels are positional; this assumes the colour scale
  # orders the codes alphabetically (de, es, fr, ...) -- verify.
  scale_color_discrete(
    name = "Country",
    labels = c(
      "Germany",
      "Spain",
      "France",
      "Italy",
      "Norway",
      "Poland",
      "Sweden",
      "Turkey",
      "Ukraine",
      "United Kingdom"
    )
  ) +
  theme_grey() +
  theme(plot.title = element_text(hjust = 0.5))
> dput(head(categ_top10EnergyModf))
structure(list(country = c("de", "de", "fr", "fr", "fr", "de"
), country_name = c("Germany", "Germany", "France", "France",
"France", "Germany"), type2 = c("Non-Renewable", "Non-Renewable",
"Non-Renewable", "Non-Renewable", "Non-Renewable", "Non-Renewable"
), year = structure(c(1L, 2L, 2L, 3L, 1L, 3L), .Label = c("2016",
"2017", "2018"), class = "factor"), ggwt_hours = c(471984, 449906,
448690.614, 447109.694, 445175.494, 393234.585)), row.names = c(NA,
-6L), groups = structure(list(country = c("de", "de", "de", "fr",
"fr", "fr"), country_name = c("Germany", "Germany", "Germany",
"France", "France", "France"), year = structure(c(1L, 2L, 3L,
1L, 2L, 3L), .Label = c("2016", "2017", "2018"), class = "factor"),
.rows = structure(list(1L, 2L, 6L, 5L, 3L, 4L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 6L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
You could display a flag as the last point:
# Draw flags only for the last year by handing this layer a filtered copy of
# the plot data. `year` is a factor in the sample data, so it is compared
# against its label; which() drops any NA comparisons.
geom_flag(data = function(d) d[which(d$year == "2018"), ], size = 4)
I also set the size not as an aesthetic.
You don't have to use factor(country): you already defined the country aesthetic in ggplot(), and the flag layer inherits it.
I am writing a model for my dataset. Once the bake() is executed, the result has one missing predictor and the outcome.
This happens after writing the recipe steps. Is there any way to resolve this issue?
# Split the data and keep both partitions.
top10_renewableEnergyProd_split <- initial_split(top10_renewableEnergyProd)
top10_renewableEnergyProd_train <- training(top10_renewableEnergyProd_split)
top10_renewableEnergyProd_test <- testing(top10_renewableEnergyProd_split)

# Recipe: model formula and preprocessing steps in a single pipeline.
top10_renewableEnergyProd_recipe <-
  recipe(
    energyProd_2018 ~ country_name + energyProd_2016 + energyProd_2017,
    data = top10_renewableEnergyProd_train
  ) %>%
  # Centre and scale the numeric columns, leaving the outcome untouched.
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes()) %>%
  # NOTE(review): unlike the two steps above, this selector does not exclude
  # the outcome. The baked output shown below lacks energyProd_2017 and
  # energyProd_2018, which is consistent with this step removing them as
  # highly correlated columns. Adding -all_outcomes() here should at least
  # keep the outcome -- confirm.
  step_corr(all_numeric())

# Estimate the steps on the training set, then apply them to the same data.
top10_renewableEnergyProd_prep <- prep(
  top10_renewableEnergyProd_recipe,
  training = top10_renewableEnergyProd_train
)
top10_renewableEnergyProd_bake <- bake(
  top10_renewableEnergyProd_prep,
  top10_renewableEnergyProd_train
)
top10_renewableEnergyProd_bake
> dput(top10_renewableEnergyProd)
structure(list(type2 = c("Renewable", "Renewable", "Renewable",
"Renewable", "Renewable", "Renewable", "Renewable", "Renewable",
"Renewable", "Renewable"), country = c("DE", "ES", "FR", "IT",
"NO", "PL", "SE", "TR", "UA", "UK"), country_name = c("Germany",
"Spain", "France", "Italy", "Norway", "Poland", "Sweden", "Turkey",
"Ukraine", "United Kingdom"), energyProd_2016 = c(147622, 103353,
99885.054, 90756.826, 146557, 15468, 77505, 87090, 12097, 58909.047
), energyProd_2017 = c(175063, 84664, 93907.184, 86786.294, 146285,
18187.708, 82540, 83536.342, 12082.6, 73113.964), energyProd_2018 = c(185226.211,
99725.566, 113658.177, 96820, 146878.825, 15541.473, 77615.947,
93425.906, 13843.9, 79955.967)), row.names = c(NA, -10L), groups = structure(list(
country = c("DE", "ES", "FR", "IT", "NO", "PL", "SE", "TR",
"UA", "UK"), country_name = c("Germany", "Spain", "France",
"Italy", "Norway", "Poland", "Sweden", "Turkey", "Ukraine",
"United Kingdom"), type2 = c("Renewable", "Renewable", "Renewable",
"Renewable", "Renewable", "Renewable", "Renewable", "Renewable",
"Renewable", "Renewable"), .rows = structure(list(1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
> dput(top10_renewableEnergyProd_bake)
structure(list(country_name = structure(c(2L, 5L, 1L, 3L, 4L,
6L, 7L, 8L), .Label = c("France", "Germany", "Italy", "Norway",
"Spain", "Sweden", "Turkey", "United Kingdom"), class = "factor"),
energyProd_2016 = c(1.47285970883518, 0.0604065991667829,
-0.0502421869332435, -0.341488748282317, 1.4388796649716,
-0.764303423167326, -0.458483028395103, -1.35762858619557
)), row.names = c(NA, -8L), class = c("tbl_df", "tbl", "data.frame"
))
Replicating a visualization I saw in print media using ggplot2
Context:
I am always looking to make data visualizations more appealing/aesthetic specifically for non-data people, who are the majority of people I work with (stakeholders like marketers, management, etc) -- I've noted that when visualizations look like academic-publication-quality (standard ggplot2 aesthetics) they tend to assume they can't understand it and don't bother trying, defeating the whole purpose of visualizations in the first place. However, when it looks more graphic'y (like something you may see on websites or marketing material) they focus and try to understand the visualization, usually successfully. Often we'll end up in the most interesting discussions from these types of visualizations, so that is my ultimate goal.
The Visualization:
Here is something I saw on some marketing brochure on the device share of web traffic by geo, and though it is actually a bit busy and unclear, it resonated better than a similar stacked bar chart I created in standard ggplot2. I have not the slightest idea how I might replicate something like this within ggplot2; any attempts would be much appreciated! Here is some sample tidy data to use in a data.table:
structure(list(country = c("Argentina", "Argentina", "Argentina",
"Brazil", "Brazil", "Brazil", "Canada",
"Canada", "Canada", "China", "China",
"China", "Japan", "Japan", "Japan", "Spain",
"Spain", "Spain", "UK", "UK", "UK", "USA",
"USA", "USA"),
device_type = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L),
class = "factor",
.Label = c("desktop",
"mobile",
"multi")),
proportion = c(0.37, 0.22, 0.41, 0.3, 0.31, 0.39,
0.35, 0.06, 0.59, 0.19, 0.2, 0.61,
0.4, 0.18, 0.42, 0.16, 0.28, 0.56,
0.27, 0.06, 0.67, 0.37, 0.08, 0.55)),
.Names = c("country", "device_type", "proportion"),
row.names = c(NA, -24L),
class = c("data.table", "data.frame"))
You could also consider googleVis
library(googleVis)
dat <- structure(list(country = c("Argentina", "Argentina", "Argentina",
"Brazil", "Brazil", "Brazil", "Canada",
"Canada", "Canada", "China", "China",
"China", "Japan", "Japan", "Japan", "Spain",
"Spain", "Spain", "UK", "UK", "UK", "USA",
"USA", "USA"),
device_type = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L),
class = "factor",
.Label = c("desktop",
"mobile",
"multi")),
proportion = c(0.37, 0.22, 0.41, 0.3, 0.31, 0.39,
0.35, 0.06, 0.59, 0.19, 0.2, 0.61,
0.4, 0.18, 0.42, 0.16, 0.28, 0.56,
0.27, 0.06, 0.67, 0.37, 0.08, 0.55)),
.Names = c("country", "device_type", "proportion"),
row.names = c(NA, -24L),
class = c("data.table", "data.frame"))
# Order in which the link sources (countries) and the nodes (countries and
# device types, interleaved row by row) first appear in `dat`.
link_order <- unique(dat$country)
node_order <- unique(
  as.vector(rbind(dat$country, as.character(dat$device_type)))
)

# Colour lookup tables. Both keep their columns as character so the lookups
# below return strings on every R version.
link_cols <- data.frame(
  color = c(
    "#ffd1ab", "#ff8d14", "#ff717e", "#dd2c40", "#d6b0ea",
    "#8c4fab", "#00addb", "#297cbe"
  ),
  country = c(
    "UK", "Canada", "USA", "China", "Spain", "Japan", "Argentina", "Brazil"
  ),
  stringsAsFactors = FALSE
)
node_cols <- data.frame(
  color = c(
    "#ffc796", "#ff7100", "#ff485b", "#d20000",
    "#cc98e6", "#6f2296", "#009bd2", "#005daf",
    "grey", "grey", "grey"
  ),
  type = c(
    "UK", "Canada", "USA", "China", "Spain", "Japan",
    "Argentina", "Brazil", "multi", "desktop", "mobile"
  ),
  stringsAsFactors = FALSE
)

# Look up one colour per link/node in plotting order. vapply() enforces
# exactly one character value per key, so a country or device type missing
# from (or duplicated in) the lookup table fails here instead of silently
# producing a malformed colour list.
link_cols2 <- vapply(
  link_order,
  function(x) link_cols[x == link_cols$country, "color"],
  character(1)
)
node_cols2 <- vapply(
  node_order,
  function(x) node_cols[x == node_cols$type, "color"],
  character(1)
)

# Render the colours as JavaScript array literals: ['#aaa','#bbb',...]
actual_link_cols <- paste0(
  "[", paste0("'", link_cols2, "'", collapse = ","), "]"
)
actual_node_cols <- paste0(
  "[", paste0("'", node_cols2, "'", collapse = ","), "]"
)

# Options string for the `sankey` chart option; links take the colour of
# their source node.
opts <- paste0("{
link: { colorMode: 'source',
colors: ", actual_link_cols, " },
node: {colors: ", actual_node_cols, "}}")
# Build the Sankey chart: links run from `country` to `device_type`, weighted
# by `proportion`; `opts` supplies the link and node colours.
Sankey <- gvisSankey(
  data = dat,
  from = "country",
  to = "device_type",
  weight = "proportion",
  options = list(height = 500, width = 1000, sankey = opts)
)
# Render the chart.
plot(Sankey)
You can also try the "ggalluvial" package and its alluvial geoms (e.g. geom_alluvium() and geom_stratum()).
Check this out
I want to replace the , with a . as the decimal point in a dataframe. I can do this with df$X2005 <- as.numeric(gsub(',', '.', df$X2005)) for each variable. Is there a more efficient way to do this for the whole dataframe at once?
Some example data:
df <- structure(list(country = structure(1:6, .Label = c("Australia", "Austria", "Belgium", "Canada", "Chile", "Czech Republic", "Denmark", "Estonia", "Finland", "France", "Germany", "Greece", "Hungary", "Iceland", "Ireland", "Israel", "Italy", "Japan", "Korea", "Luxembourg", "Mexico", "Netherlands", "New Zealand", "Norway", "Poland", "Portugal", "Slovak Republic", "Slovenia", "Spain", "Sweden", "Switzerland", "Turkey", "United Kingdom", "United States"), class = "factor"), X2005 = structure(c(26L, 2L, 34L, 33L, 13L, 14L), .Label = c("10,3533", "10,4187", "10,8089", "10,8629", "10,882", "11,0173", "15,8399", "5,0226", "5,4488", "5,6273", "5,8713", "6,2137", "6,6397", "6,9339", "7,0448", "7,5719", "7,8534", "7,9457", "8,1819", "8,2668", "8,2883", "8,3556", "8,394", "8,4295", "8,4456", "8,4794", "8,7437", "9,0304", "9,0615", "9,4427", "9,6618", "9,77", "9,8295", "9,9833"), class = "factor"), X2006 = structure(c(25L, 2L, 31L, 34L, 13L, 14L), .Label = c("10,0326", "10,2177", "10,3877", "10,6374", "10,7468", "10,9516", "15,9368", "5,0169", "5,6845", "5,8109", "6,1019", "6,2008", "6,285", "6,6937", "7,3477", "7,5148", "7,5836", "7,7495", "8,1986", "8,2586", "8,2807", "8,3448", "8,39", "8,4289", "8,5204", "8,564", "8,8247", "8,8401", "8,948", "9,1292", "9,4811", "9,7487", "9,9243", "9,9621"), class = "factor"), X2007 = structure(c(27L, 3L, 31L, 1L, 14L, 13L), .Label = c("10,0263", "10,2099", "10,2617", "10,4771", "10,7642", "10,8754", "16,1608", "5,1597", "5,7779", "6,0372", "6,3331", "6,3858", "6,5223", "6,5494", "7,1288", "7,6299", "7,6744", "7,7553", "7,8565", "7,9023", "8,043", "8,2295", "8,4769", "8,4908", "8,5014", "8,504", "8,5531", "8,746", "8,9172", "9,0913", "9,5254", "9,8104", "9,9873", "9,9942"), class = "factor"), X2008 = structure(c(26L, 6L, 34L, 4L, 17L, 15L), .Label = c("10,1268", "10,183", "10,2189", "10,2537", "10,289", "10,4896", "10,7042", "10,9909", "11,0232", "16,6201", "5,8474", "6,0577", "6,0745", "6,586", "6,8189", "6,8863", "7,1361", "7,1819", 
"7,4631", "7,7052", "8,0208", "8,3068", "8,3457", "8,5513", "8,605", "8,751", "8,8915", "8,9402", "8,9521", "9,0591", "9,1344", "9,2284", "9,3051", "9,9128"), class = "factor"), X2009 = structure(c(24L, 8L, 5L, 9L, 21L, 22L), .Label = c("", "10,0115", "10,0496", "10,1957", "10,5938", "10,8137", "11,0005", "11,1729", "11,3992", "11,4722", "11,7314", "11,7516", "11,8823", "17,6706", "6,4098", "7,039", "7,1018", "7,2127", "7,6797", "7,7356", "7,8649", "7,9514", "7,9657", "9,0423", "9,152", "9,17", "9,1947", "9,4037", "9,5258", "9,6247", "9,636", "9,6743", "9,9056", "9,939"), class = "factor"), X2010 = structure(c(23L, 6L, 3L, 8L, 18L, 19L), .Label = c("", "10,1995", "10,503", "10,797", "10,8817", "11,0318", "11,0751", "11,3738", "11,5495", "11,677", "12,0661", "17,6911", "6,1782", "6,3394", "7,0229", "7,1675", "7,2911", "7,37", "7,4319", "7,6856", "8,0302", "8,8718", "8,9481", "8,9888", "8,995", "9,2925", "9,312", "9,4079", "9,4224", "9,4688", "9,5277", "9,5504", "9,589", "9,6074"), class = "factor"), X2011 = structure(c(NA, 5L, 4L, 8L, 18L, 17L), .Label = c("", "10,2345", "10,2844", "10,5139", "10,7769", "10,8671", "11,0148", "11,1784", "11,3323", "11,6343", "11,9369", "17,683", "5,922", "6,6464", "6,8709", "7,3692", "7,5011", "7,5206", "7,7333", "7,8877", "7,9416", "8,8501", "8,9042", "9,0019", "9,027", "9,1296", "9,2256", "9,2837", "9,2969", "9,4184", "9,4661"), class = "factor"), X2012 = structure(c(NA, NA, NA, 2L, 5L, NA), .Label = c("", "11,2132", "11,2955", "7,5249", "7,6077", "7,8226", "8,7596", "8,923", "9,148", "9,167", "9,3722"), class = "factor")), .Names = c("country", "X2005", "X2006", "X2007", "X2008", "X2009", "X2010", "X2011", "X2012"), row.names = c(NA, 6L), class = "data.frame")
You can instead read your data in the correct format.
If you use read.table, for instance, set the dec = "," argument. You'll probably need to set the na.strings = "<NA>" argument as well.
You could use a for loop:
# Convert every column except the first (country) from "1,23"-style text to
# numeric; empty strings become NA.
# names(df)[-1] is used instead of names(df[, -1]) so the loop still runs
# when only one value column exists, and `[[` is used so a column is always
# extracted as a vector (df[, col] is not a vector for tibbles).
for (col in names(df)[-1]) {
  df[[col]] <- as.numeric(gsub(",", ".", df[[col]], fixed = TRUE))
}