complex ggplot in R - half circular bar plot - r

Okay so here is the challenge. How do I recreate this chart?
The numbers and so on do not have to match; what I am really trying to do is create a circular bar chart in a gauge-type layout with the gap. Headers and text are optional. It is more just the idea of a 3/4 circular bar chart.
Here is some example code that I am playing with:
library(ggplot2)
# Allocation per fixed-income category; x_ax gives each bar a numeric position.
fixed_income.df <- data.frame(
  name = c("total", "US Gov't Debt", "US Municipal Debt",
           "US IG Corp", "US HY Corp", "Int'l Developed",
           "Emerging Market"),
  allocation = c(3, 1, 4, 3, 4, 2, 3),
  x_ax = c(1:7)
)

# Bar chart of allocation by position.
# The snippet previously had an unmatched ")" in front of `+ coord_flip()`,
# so it could not be parsed; the chain below is balanced.
# NOTE(review): a ggplot holds a single coordinate system, so the trailing
# coord_flip() replaces the coord_polar() added just before it -- confirm
# which of the two is actually wanted.
ggplot(fixed_income.df, aes(x = as.numeric(x_ax), y = allocation)) +
  geom_bar(stat = "identity") +
  ylim(-5, 5) +
  coord_polar(theta = "x", start = -3) +
  coord_flip()
which returns:
Any help will earn a cookie! No, really, any help would be so appreciated; I am stuck.
Sody

The code for the basic plot is fairly simple (at least, without the annotations)
library(ggplot2)
# Gauge-style circular bar chart: stacked columns of `yvals` per `xvals`,
# filled by `cols`, drawn in polar coordinates with white separator rules.
ggplot(data = df, mapping = aes(x = xvals, y = yvals, fill = cols)) +
  # Layers, back to front: bars, thick white rules, thin white ticks.
  geom_col(width = 1) +
  geom_vline(
    xintercept = c(0.5, 1.5, 8.5, 15.5, 16.5, 17.5),
    colour = "white", size = 3
  ) +
  geom_segment(
    # One short segment from y = 0 to y = 1 at every half-integer x position;
    # the layer brings its own data and ignores the plot-level mapping.
    mapping = aes(x = x, xend = x, y = y, yend = yend),
    data = data.frame(x = 0.5 + 1:23, y = 0, yend = 1),
    colour = "white", inherit.aes = FALSE
  ) +
  # Scales. NOTE(review): the negative lower y limit and the x expansion are
  # presumably what produce the hollow centre and the gap in the ring.
  scale_x_continuous(expand = c(0.2, 1)) +
  scale_y_continuous(limits = c(-2, 3)) +
  scale_fill_manual(
    # The colour vector is reversed before being handed to the scale.
    values = rev(c("#e9cbc1", "#b54649", "gray90",
                   "gray50", "#8ba55d", "#e2e4d6",
                   "white", "#c3a891", "#37959d",
                   "#5c7890", "#dcad3c", "#55a3b9",
                   "#f39068"))
  ) +
  coord_polar(start = -pi) +
  # Blank theme first, then switch the legend off on top of it.
  theme_void() +
  theme(legend.position = "none")
It's getting your data in the correct format that's going to be difficult:
df <- structure(list(xvals = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 18L, 18L, 18L,
18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 19L, 19L, 19L,
19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 21L, 21L, 21L,
21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 22L, 22L, 22L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 23L, 23L, 23L,
23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L), yvals = c(0.45,
0, 0, 0.1, 0, 0.45, 0.5, 1, 0, 0, 0, 0, 0, 0.45, 0, 0.05, 0,
0.2, 0.3, 0.5, 0, 1, 0, 0, 0, 0, 0.3, 0.15, 0.05, 0, 0, 0.5,
0.5, 0, 1, 0, 0, 0, 0, 0.3, 0.15, 0.05, 0, 0, 0.5, 0.5, 0, 1,
0, 0, 0, 0, 0.45, 0, 0.05, 0, 0.2, 0.3, 0.5, 0, 1, 0, 0, 0, 0,
0.45, 0, 0.05, 0, 0.2, 0.3, 0.5, 0, 1, 0, 0, 0, 0, 0.45, 0, 0,
0.1, 0, 0.45, 0.5, 0, 1, 0, 0, 0, 0, 0.45, 0, 0, 0.1, 0, 0.45,
0.5, 0, 1, 0, 0, 0, 0, 0.3, 0.15, 0.05, 0, 0, 0.5, 0.5, 0, 0,
1, 0, 0, 0, 0.15, 0.3, 0.05, 0, 0, 0.5, 0.5, 0, 0, 1, 0, 0, 0,
0.45, 0, 0.05, 0, 0.2, 0.3, 0.5, 0, 0, 1, 0, 0, 0, 0.45, 0, 0,
0.1, 0, 0.45, 0.5, 0, 0, 1, 0, 0, 0, 0.45, 0, 0.05, 0, 0.2, 0.3,
0.5, 0, 0, 1, 0, 0, 0, 0.3, 0.15, 0.05, 0, 0, 0.5, 0.5, 0, 0,
1, 0, 0, 0, 0.45, 0, 0, 0.1, 0, 0.45, 0.5, 0, 0, 1, 0, 0, 0,
0.45, 0, 0, 0.1, 0, 0.45, 0.5, 0, 0, 0, 1, 0, 0, 0.45, 0, 0,
0.1, 0, 0.45, 0.5, 0, 0, 0, 0, 1, 0, 0.45, 0, 0.05, 0, 0.2, 0.3,
0.5, 0, 0, 0, 0, 0, 1, 0.3, 0.15, 0.05, 0, 0, 0.5, 0.5, 0, 0,
0, 0, 0, 1, 0.45, 0, 0, 0.1, 0, 0.45, 0.5, 0, 0, 0, 0, 0, 1,
0.45, 0, 0, 0.1, 0, 0.45, 0.5, 0, 0, 0, 0, 0, 1, 0.45, 0, 0,
0.1, 0, 0.45, 0.5, 0, 0, 0, 0, 0, 1, 0.45, 0, 0, 0.1, 0, 0.45,
0.5, 0, 0, 0, 0, 0, 1), cols = structure(c(13L, 12L, 11L, 10L,
9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 13L, 12L, 11L, 10L, 9L, 8L,
7L, 6L, 5L, 4L, 3L, 2L, 1L, 13L, 12L, 11L, 10L, 9L, 8L, 7L, 6L,
5L, 4L, 3L, 2L, 1L, 13L, 12L, 11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L,
3L, 2L, 1L, 13L, 12L, 11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L,
1L, 13L, 12L, 11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 13L,
12L, 11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 13L, 12L,
11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 13L, 12L, 11L,
10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 13L, 12L, 11L, 10L,
9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 13L, 12L, 11L, 10L, 9L, 8L,
7L, 6L, 5L, 4L, 3L, 2L, 1L, 13L, 12L, 11L, 10L, 9L, 8L, 7L, 6L,
5L, 4L, 3L, 2L, 1L, 13L, 12L, 11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L,
3L, 2L, 1L, 13L, 12L, 11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L,
1L, 13L, 12L, 11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 13L,
12L, 11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 13L, 12L,
11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 13L, 12L, 11L,
10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 13L, 12L, 11L, 10L,
9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 13L, 12L, 11L, 10L, 9L, 8L,
7L, 6L, 5L, 4L, 3L, 2L, 1L, 13L, 12L, 11L, 10L, 9L, 8L, 7L, 6L,
5L, 4L, 3L, 2L, 1L, 13L, 12L, 11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L,
3L, 2L, 1L, 13L, 12L, 11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L,
1L), .Label = c("Nesting Variable 6", "Nesting Variable 5",
"Nesting Variable 4",
"Nesting Variable 3", "Nesting Variable 2", "Nesting Variable 1",
"blank", "mint", "green", "darkgray", "lightgray", "red", "pink"
), class = "factor")), class = "data.frame", row.names = c(NA,
-299L))

OMG blonde moment, the answer is so simple.. How did I miss it..
xlim()

Related

Aesthetics do not work for individual geom_line's in R's ggplot

I am trying to construct a plot with thin, transparent individual lines for individual people (indicated by the id column) and place a thick, solid average-across-people line on top of that.
My ggplot call looks like this:
# Per-person lines (grouped and coloured by id) with a second line layer drawn
# from data_averaged on top; all three legends are switched off.
# NOTE(review): size and alpha are written as constants inside aes(), so they
# are handled as mapped aesthetics rather than fixed values -- see ggplot2's
# guidance on setting vs. mapping.
ggplot(data, aes(x = card_self, y = percent)) +
geom_line(aes(group=id, color=id, size=1, alpha=0.8)) +
geom_line(data = data_averaged, aes(size=10, alpha=0.9)) +
guides(color='none', alpha='none', size='none')
The result is this:
The problem is that changing the size/alpha aesthetics does not work as expected. I've tried setting them outside the aes argument and inside and the results are as random as it gets.
My full code to reproduce the plot is here:
# The plotting package is named "ggplot2" (tidyverse attaches it as well);
# library("ggplot") would fail on a standard installation.
library("tidyverse")
library("ggplot2")
data <- structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 15L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L,
18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 19L, 19L, 19L, 19L, 19L,
19L, 19L, 19L, 19L, 19L, 19L, 19L, 20L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
21L, 21L, 21L, 21L, 21L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, 22L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L,
23L, 23L, 23L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L,
24L, 24L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L,
25L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L,
27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 28L,
28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L, 29L, 29L,
29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 30L, 30L, 30L,
30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L), .Label = c("P001",
"P002", "P003", "P004", "P005", "P006", "P007", "P008", "P009",
"P010", "P011", "P012", "P013", "P014", "P015", "P016", "P017",
"P018", "P019", "P020", "P021", "P022", "P023", "P024", "P025",
"P026", "P027", "P028", "P029", "P030"), class = "factor"), card_self = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 1L, 2L, 3L, 4L,
5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L,
9L, 10L, 11L, 12L, 13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L,
11L, 12L, 13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L,
13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 1L,
2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 1L, 2L, 3L, 4L,
5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L,
9L, 10L, 11L, 12L, 13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L,
11L, 12L, 13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L,
13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 1L,
2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 1L, 2L, 3L, 4L,
5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L,
9L, 10L, 11L, 12L, 13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L,
11L, 12L, 13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L,
13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 1L,
2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 1L, 2L, 3L, 4L,
5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L,
9L, 10L, 11L, 12L, 13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L,
11L, 12L, 13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L,
13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 1L,
2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 1L, 2L, 3L, 4L,
5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L,
9L, 10L, 11L, 12L, 13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L,
11L, 12L, 13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L,
13L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L), .Label = c("ace",
"two", "three", "four", "five", "six", "seven", "eight", "nine",
"ten", "jack", "queen", "king"), class = "factor"), percent = c(1,
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0.89, 1, 0.67, 0, 0,
0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
0.67, 0.22, 0.11, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0.89, 0, 0, 0,
0, 0, 1, 1, 0.89, 1, 1, 0.89, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0.89,
0.56, 0, 0.22, 0, 0, 0, 0, 0.56, 0.89, 0.67, 0.89, 0.67, 0.89,
0.56, 0.67, 0.44, 0.89, 0.56, 0.56, 1, 1, 1, 1, 0.44, 0.56, 0.11,
0.22, 0.22, 0, 0, 0, 1, 1, 1, 1, 0.44, 0.33, 0.67, 0.33, 0.11,
0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0.89, 1, 0.89,
0.78, 0.22, 0.56, 0.33, 0.22, 0.11, 0, 1, 1, 1, 1, 0.44, 0.11,
0, 0, 0, 0, 0, 0, 1, 0.78, 0.89, 0.56, 0.67, 0.67, 0.44, 0.44,
0.44, 0.67, 0.22, 0, 1, 1, 1, 1, 1, 0.78, 0.44, 0, 0.78, 0.44,
0, 0, 1, 1, 1, 1, 1, 1, 0.11, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
0, 0.22, 0, 0, 0, 0, 0.67, 0.44, 0.33, 0.56, 0.11, 0.33, 0.44,
0.56, 0.44, 0.33, 0.22, 0.33, 1, 1, 1, 1, 1, 1, 0.11, 0.11, 0.11,
0, 0, 0, 1, 1, 1, 1, 1, 0.67, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
0.44, 0, 0.33, 0.22, 0, 0, 0, 1, 0.56, 0.67, 0.89, 0.89, 0.78,
0.67, 0.44, 0.33, 0, 0.33, 0, 1, 1, 0.89, 0.78, 0.67, 0.44, 0.33,
0, 0.22, 0, 0, 0, 1, 1, 1, 1, 1, 0.78, 0, 0, 0, 0, 0, 0, 1, 1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0.67, 0.44, 0.56, 0.22, 0.33, 0.44,
0.56, 0.56, 0.22, 0.44, 0.44, 0.33, 1, 1, 1, 1, 1, 1, 0.56, 0.11,
0, 0, 0, 0, 1, 1, 1, 1, 0.44, 0.44, 0.33, 0, 0, 0, 0, 0, 1, 1,
0.89, 0.67, 0.33, 0.33, 0.56, 0.22, 0.11, 0.11, 0, 0, 1, 1, 1,
1, 1, 0.78, 0.22, 0, 0, 0, 0, 0)), row.names = c(NA, -360L), groups = structure(list(
id = structure(1:30, .Label = c("P001", "P002", "P003", "P004",
"P005", "P006", "P007", "P008", "P009", "P010", "P011", "P012",
"P013", "P014", "P015", "P016", "P017", "P018", "P019", "P020",
"P021", "P022", "P023", "P024", "P025", "P026", "P027", "P028",
"P029", "P030"), class = "factor"), .rows = structure(list(
1:12, 13:24, 25:36, 37:48, 49:60, 61:72, 73:84, 85:96,
97:108, 109:120, 121:132, 133:144, 145:156, 157:168,
169:180, 181:192, 193:204, 205:216, 217:228, 229:240,
241:252, 253:264, 265:276, 277:288, 289:300, 301:312,
313:324, 325:336, 337:348, 349:360), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -30L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
# Mean of percent for each card_self level, taken over all rows of data.
data_averaged <- summarise(
  group_by(data, card_self),
  percent = mean(percent)
)

# One line per id plus the averaged line on top; legends suppressed.
ggplot(data = data, mapping = aes(x = card_self, y = percent)) +
  geom_line(
    mapping = aes(group = id, color = id, size = 1, alpha = 0.8)
  ) +
  geom_line(
    mapping = aes(size = 10, alpha = 0.9),
    data = data_averaged
  ) +
  guides(color = "none", alpha = "none", size = "none")
Update: Important comment from teunbrand:
"In addition, I'd like to add that the reason the group aesthetic should be added is because discrete x-axes automatically group observations"
Just add group=1 to ggplot()
The issue is that the second geom_line is not grouped. The data points must be grouped to connect correctly.
# Option 1: set group = 1 for the whole plot; the per-person layer overrides
# it with group = id, so only the averaged layer ends up as a single group.
# size and alpha are fixed values, so they are passed outside aes() (set, not
# mapped). They are then used literally and create no legends, which is why
# only the colour guide has to be switched off. Adjust the numbers to taste.
ggplot(data, aes(x = card_self, y = percent, group = 1)) +
  geom_line(aes(group = id, color = id), size = 1, alpha = 0.8) +
  geom_line(data = data_averaged, size = 10, alpha = 0.9) +
  guides(color = "none")
# also works:
# Option 2: leave the plot-level mapping alone and put group = 1 only on the
# averaged layer.
ggplot(data, aes(x = card_self, y = percent)) +
  geom_line(aes(group = id, color = id), size = 1, alpha = 0.8) +
  geom_line(data = data_averaged, aes(group = 1), size = 10, alpha = 0.9) +
  guides(color = "none")

r geom_bar reorder layers of bars by values

I have produced a bar chart that shows cumulative totals over periods of months for various programs using the following data structure and code:
library(dplyr)
# Running total of Value within each Program, accumulated over rows sorted by
# Period and then Program. The result is left grouped by Program.
# arrange() sorts the whole table regardless of grouping by default, so the
# sort needs no grouping step around it.
data_totals <- data_long %>%
  arrange(Period, Program) %>%
  group_by(Program) %>%
  mutate(Running_Total = cumsum(Value))
dput(data_totals)
structure(list(Period = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 5L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L,
8L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L,
11L, 11L, 12L, 12L, 12L, 12L, 12L), .Label = c("2018-04", "2018-05",
"2018-06", "2018-07", "2018-08", "2018-09", "2018-10", "2018-11",
"2018-12", "2019-01", "2019-02", "2019-03", "Apr-Mar 2019"), class = "factor"),
Program = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
5L, 1L, 2L, 3L, 4L, 5L), .Label = c("A",
"B", "C", "D",
"E"), class = "factor"), Value = c(5597,
0, 0, 0, 1544, 0, 0, 0, 0, 1544, 0, 0, 0, 0, 1544, 0, 0,
850, 0, 1544, 0, 0, 0, 0, 1544, 0, 0, 0, 0, 1544, 0, 0, 0,
0, 1544, 0, 0, 0, 0, 1544, 0, 0, 0, 0, 1544, 0, 0, 0, 0,
1544, 0, 0, 0, 0, 1544, 0, 0, 0, 0, 1544), Running_Total = c(5597,
0, 0, 0, 1544, 5597, 0, 0, 0, 3088, 5597, 0, 0, 0, 4632,
5597, 0, 850, 0, 6176, 5597, 0, 850, 0, 7720, 5597, 0, 850,
0, 9264, 5597, 0, 850, 0, 10808, 5597, 0, 850, 0, 12352,
5597, 0, 850, 0, 13896, 5597, 0, 850, 0, 15440, 5597, 0,
850, 0, 16984, 5597, 0, 850, 0, 18528)), .Names = c("Period",
"Program", "Value", "Running_Total"), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -60L), vars = "Program", labels = structure(list(
Program = structure(1:5, .Label = c("A",
"B", "C", "D",
"E"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L), vars = "Program", drop = TRUE, .Names = "Program"), indices = list(
c(0L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L
), c(1L, 6L, 11L, 16L, 21L, 26L, 31L, 36L, 41L, 46L, 51L,
56L), c(2L, 7L, 12L, 17L, 22L, 27L, 32L, 37L, 42L, 47L, 52L,
57L), c(3L, 8L, 13L, 18L, 23L, 28L, 33L, 38L, 43L, 48L, 53L,
58L), c(4L, 9L, 14L, 19L, 24L, 29L, 34L, 39L, 44L, 49L, 54L,
59L)), drop = TRUE, group_sizes = c(12L, 12L, 12L, 12L, 12L
), biggest_group_size = 12L)
# reorder the groups descending so that the lowest total will be on layers from front to back
# NOTE(review): the value returned by reorder() is not assigned to anything
# here, so data_totals$Program itself is left unchanged by this line.
reorder(data_totals$Program, -data_totals$Running_Total)
# Bars of Running_Total per Period, one colour/fill per Program, drawn with
# position = "identity" (not stacked or dodged) and fully opaque.
ggplot(data = data_totals, aes(x = Period, y = Running_Total)) +
geom_bar(aes(color = Program, group = Program, fill = Program),
stat = "identity", position = "identity", alpha = 1.0)
It works in that it creates the graph with all the proper data, but the smaller Running_Totals are obscured by the larger ones.
I get the following warning message as well:
Warning message:
The plyr::rename operation has created duplicates for the following name(s): (`colour`)
Even though I do not have the plyr package loaded.
I can see all the Running_Totals if I set the alpha to 0.5
Running_Total for each Program by Period, alpha = 0.5:
How can I get the layers ordered so that the smallest values are on the front most layers working back toward the highest values?
The way I was trying to represent the data in the original question was flawed.
There is no advantage to having the Program with the maximum value for each Period be the top of the bar.
A more illustrative solution is to have a stacked bar, with labels indicating the contribution of each Program to the overall value of each Period:
# Stacked bars of Running_Total per Period, one segment per Program, each
# segment labelled with its value at the segment's vertical midpoint.
# Only rows whose Running_Total is above zero are plotted.
ggplot(
  data = data_totals[which(data_totals$Running_Total > 0),],
  mapping = aes(x = Period, y = Running_Total, fill = Program)
) +
  geom_bar(
    mapping = aes(color = Program, group = Program, fill = Program),
    stat = "identity",
    position = "stack",
    alpha = 1.0
  ) +
  geom_text(
    mapping = aes(label = Running_Total),
    position = position_stack(vjust = 0.5)
  )
I used [which(data_totals$Running_Total > 0),] to eliminate any "0" bars and labels.

Different versions of R, lme4 and OS X give different fixed-effects significance results in glmer

I am running a logit mixed-effects model using glmer() in package lme4.
The experiment used a within-subjects within-items design with Subjects and Items as crossed random effects.
My problem: different versions of R and lme4 (run on different OS X) produce different standard errors estimates for the fixed effects, and consequently, different significance results.
Here is a subset of my data (data from the last two subjects):
structure(list(SubjN = c(87L, 87L, 87L, 87L, 87L, 87L, 87L, 87L,
87L, 87L, 87L, 87L, 87L, 87L, 87L, 87L, 87L, 87L, 87L, 87L, 87L,
87L, 87L, 87L, 88L, 88L, 88L, 88L, 88L, 88L, 88L, 88L, 88L, 88L,
88L, 88L, 88L, 88L, 88L, 88L, 88L, 88L, 88L, 88L, 88L, 88L, 88L,
88L), Items = structure(c(3L, 10L, 11L, 5L, 1L, 12L, 2L, 6L,
9L, 6L, 3L, 4L, 8L, 11L, 12L, 7L, 8L, 2L, 7L, 10L, 9L, 5L, 1L,
4L, 10L, 3L, 5L, 11L, 12L, 1L, 2L, 6L, 9L, 6L, 3L, 4L, 8L, 11L,
12L, 7L, 2L, 8L, 10L, 7L, 9L, 5L, 1L, 4L), .Label = c("a", "c",
"k", "f", "g", "i", "d", "l", "e", "j", "b", "h"), class = "factor"),
IV1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("N", "L", "P"
), class = "factor"), DV = c(0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
IV1.h = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), contrasts = structure(c(-1,
0.5, 0.5, 0, -0.5, 0.5), .Dim = c(3L, 2L), .Dimnames = list(
c("N", "L", "P"), c("N_vs_L&P", "L_vs_P"))), .Label = c("N",
"L", "P"), class = "factor"), N_vs_LP = c(-1, -1, -1, -1,
-1, -1, -1, -1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, -1, -1, -1, -1, -1, -1,
-1, -1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
0.5, 0.5, 0.5, 0.5, 0.5, 0.5), L_vs_P = c(0, 0, 0, 0, 0,
0, 0, 0, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5,
0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0, 0,
0, 0, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, 0.5,
0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5)), .Names = c("SubjN",
"Items", "IV1", "DV", "IV1.h", "N_vs_LP", "L_vs_P"), row.names = c("3099",
"3100", "3101", "3102", "3103", "3104", "3119", "3120", "3107",
"3108", "3109", "3110", "3097", "3098", "3105", "3106", "3115",
"3116", "3117", "3118", "3111", "3112", "3113", "3114", "3147",
"3148", "3149", "3150", "3151", "3152", "3167", "3168", "3155",
"3156", "3157", "3158", "3145", "3146", "3153", "3154", "3163",
"3164", "3165", "3166", "3159", "3160", "3161", "3162"), class = "data.frame")
Each subject was tested on 24 trials on 3 different conditions (factor IV1, levels: N, L, P).
I recorded whether they produced a target linguistic structure (DV == 1) or not (DV == 0).
In the analysis, I only included those subjects who produced the target structure at least once.
Nonetheless, most of them produced the target structure only on very few occasions. This is the proportion of DV == 1 produced by each subject in each condition:
library(plyr)
#dput(ddply(mydata, .(SubjN, IV1), summarise, l = length(DV), y = round(mean(DV),2)))
structure(list(SubjN = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L,
4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L,
9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 13L, 13L,
13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L, 17L, 17L,
18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 21L, 21L, 22L,
22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L, 25L, 25L, 26L, 26L,
26L, 27L, 27L, 27L, 28L, 28L, 28L, 29L, 29L, 29L, 30L, 30L, 30L,
31L, 31L, 31L, 32L, 32L, 32L, 33L, 33L, 33L, 34L, 34L, 34L, 35L,
35L, 35L, 36L, 36L, 36L, 37L, 37L, 37L, 38L, 38L, 38L, 39L, 39L,
39L, 40L, 40L, 40L, 41L, 41L, 41L, 42L, 42L, 42L, 43L, 43L, 43L,
44L, 44L, 44L, 45L, 45L, 45L, 46L, 46L, 46L, 47L, 47L, 47L, 48L,
48L, 48L, 49L, 49L, 49L, 50L, 50L, 50L, 51L, 51L, 51L, 52L, 52L,
52L, 53L, 53L, 53L, 54L, 54L, 54L, 55L, 55L, 55L, 56L, 56L, 56L,
57L, 57L, 57L, 58L, 58L, 58L, 59L, 59L, 59L, 60L, 60L, 60L, 61L,
61L, 61L, 62L, 62L, 62L, 63L, 63L, 63L, 64L, 64L, 64L, 65L, 65L,
65L, 66L, 66L, 66L, 67L, 67L, 67L, 68L, 68L, 68L, 69L, 69L, 69L,
70L, 70L, 70L, 71L, 71L, 71L, 72L, 72L, 72L, 73L, 73L, 73L, 74L,
74L, 74L, 75L, 75L, 75L, 76L, 76L, 76L, 77L, 77L, 77L, 78L, 78L,
78L, 79L, 79L, 79L, 80L, 80L, 80L, 81L, 81L, 81L, 82L, 82L, 82L,
83L, 83L, 83L, 84L, 84L, 84L, 85L, 85L, 85L, 86L, 86L, 86L, 87L,
87L, 87L, 88L, 88L, 88L), IV1 = structure(c(1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L), .Label = c("N", "L", "P"), class = "factor"), l = c(8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 7L, 8L, 7L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 7L, 8L, 8L, 8L, 8L, 8L, 8L,
7L, 8L, 6L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 7L, 8L, 8L, 7L, 7L, 8L, 7L, 8L,
8L, 7L, 8L, 8L, 7L, 8L, 8L, 7L, 8L, 8L, 7L, 8L, 8L, 7L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 6L, 8L, 4L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 7L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 7L,
8L, 8L, 7L, 8L, 8L, 7L, 8L, 8L, 7L, 8L, 8L, 7L, 8L, 8L, 7L, 8L,
8L, 7L, 8L, 8L, 7L, 8L, 8L, 7L, 8L, 8L, 7L, 8L, 7L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L), y = c(1, 0.88, 1, 0.5, 0.25, 0.62,
0, 0, 0.25, 0, 0.25, 0, 0.12, 0, 0, 0, 0.12, 0, 0, 0.12, 0.12,
0, 0, 0.12, 0.38, 0, 0.25, 0, 0.12, 0, 0.12, 0, 0.25, 0, 0, 0.12,
0.5, 0.25, 0.5, 0, 0, 0.12, 0, 0.25, 0.12, 0, 0, 0.12, 0, 0.12,
0, 0, 0.12, 0.12, 0.12, 0.62, 0, 0, 0.5, 0.25, 1, 0.88, 1, 0,
0, 0.12, 0, 0.12, 0.12, 0.12, 0.12, 0, 0.62, 0.62, 0.38, 0.5,
0.88, 0.12, 0.12, 0, 0, 0.12, 0.12, 0, 0, 0.12, 0, 0, 0.12, 0,
0, 0.12, 0, 0, 0.25, 0, 0, 0.14, 0, 0.5, 0.57, 0.29, 0, 0.12,
0, 0, 0.12, 0, 0.25, 0.5, 0.25, 0, 0.12, 0.12, 0.25, 0, 0.38,
0, 0, 0.12, 0, 0, 1, 0.25, 0.12, 0.25, 0, 0.12, 0.12, 0, 0, 0.12,
0, 0, 0.12, 0.12, 0, 0, 0.12, 0, 0.14, 0.14, 0.12, 0, 0.12, 0,
0, 0.12, 0.12, 0, 1, 0.88, 1, 0, 0.12, 0, 0.12, 0, 0, 0.12, 0,
0.12, 0, 0, 0.12, 0.12, 0.12, 0.12, 1, 1, 1, 0.12, 0, 0, 0.12,
0.38, 0, 0, 0.12, 0, 0, 0, 0.5, 0.5, 0, 0.25, 0, 0.12, 0.29,
0, 0, 0.38, 0, 0, 0.62, 0.5, 0, 0.12, 0, 0.12, 0.12, 0.25, 0.12,
0.25, 0.12, 0, 0.12, 0, 0, 0.12, 0, 0, 0.12, 0, 0.12, 0.12, 0,
0.12, 0.12, 0, 0, 0.12, 0.12, 0.12, 0, 0.38, 0.12, 0.57, 0, 0.12,
0, 0, 0.12, 0, 0, 0.12, 0, 0, 0.12, 0.14, 0.88, 0.88, 0.86, 0,
0, 0.14, 0, 0.12, 0.14, 0, 0.12, 0, 0, 0, 0.12, 0, 0, 0.12, 0.38,
0, 0, 0.5, 0.12, 0)), .Names = c("SubjN", "IV1", "l", "y"), row.names = c(NA,
-264L), class = "data.frame")
I run the following model including IV1 as fixed effect with helmert-contrast coding;
first contrast: N vs. L & P, second contrast: L vs. P.
# Binomial mixed model of DV on the contrast-coded condition factor IV1.h.
# Random effects: by-subject intercept and IV1.h slopes; by-item intercept,
# plus the two contrast slopes entered as separate by-item terms.
m1 <- glmer(
  DV ~ IV1.h +
    (1 + IV1.h | SubjN) +
    (1 | Items) +
    (0 + N_vs_LP | Items) +
    (0 + L_vs_P | Items),
  data = mydata,
  family = "binomial"
)
The model does not allow for the correlation between the by-Items random variables (I did this by creating separate slopes for the two contrasts), since when correlation was allowed they were perfectly correlated (which I interpreted as a sign of over-parametrization).
1) Results using
os x 10.8.5 mountain lion
R version 3.0.2 (2013-09-25)
lme4_1.0-5
(the original analysis I ran)
Generalized linear mixed model fit by maximum likelihood ['glmerMod']
Family: binomial ( logit )
Formula: DV ~ IV1.h + (1 + N_vs_LP + L_vs_P | SubjN) + (1 | Items) + (0 + N_vs_LP | Items) + (0 + L_vs_P | Items)
Data: mydata
AIC BIC logLik deviance
1492.5408 1560.2050 -734.2704 1468.5408
Random effects:
Groups Name Variance Std.Dev. Corr
SubjN (Intercept) 2.3885505 1.54549
N_vs_LP 0.4394195 0.66289 -0.69
L_vs_P 1.9287559 1.38880 0.04 0.08
Items (Intercept) 0.0531518 0.23055
Items.1 N_vs_LP 0.0001950 0.01396
Items.2 L_vs_P 0.0003619 0.01902
Number of obs: 2077, groups: SubjN, 88; Items, 12
Fixed effects:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.2998 0.1964 -11.710 < 2e-16 ***
IV1.hN_vs_L&P 0.3704 0.1378 2.689 0.00717 **
IV1.hL_vs_P 0.2060 0.2320 0.888 0.37459
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) IV1.N_
IV1.hN_vs_L&P -0.388
IV1.hL_vs_P 0.014 0.019
2) Results using:
OS X 10.9.4 Mavericks
R version 3.1.1 (2014-07-10)
lme4_1.1-7
optimizer 'bobyqa'
Generalized linear mixed model fit by maximum likelihood (Laplace Approximation) ['glmerMod']
Family: binomial ( logit )
Formula: DV ~ IV1.h + (1 + N_vs_LP + L_vs_P | SubjN) + (1 | Items) + (0 +
N_vs_LP | Items) + (0 + L_vs_P | Items)
Data: mydata
Control: glmerControl(optimizer = "bobyqa")
AIC BIC logLik deviance df.resid
1492.5 1560.2 -734.3 1468.5 2065
Scaled residuals:
Min 1Q Median 3Q Max
-2.4174 -0.3364 -0.2595 -0.1706 4.6028
Random effects:
Groups Name Variance Std.Dev. Corr
SubjN (Intercept) 2.38791 1.5453
N_vs_LP 0.43935 0.6628 -0.69
L_vs_P 1.92629 1.3879 0.04 0.07
Items (Intercept) 0.05319 0.2306
Items.1 N_vs_LP 0.00000 0.0000
Items.2 L_vs_P 0.00000 0.0000
Number of obs: 2077, groups: SubjN, 88; Items, 12
Fixed effects:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.2998 0.2095 -10.975 <2e-16 ***
IV1.hN_vs_L&P 0.3703 0.1892 1.958 0.0503 .
IV1.hL_vs_P 0.2063 0.2679 0.770 0.4413
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) IV1.N_
IV1.hN__L&P -0.379
IV1.hL_vs_P -0.001 0.003
I really don't know which outcome I should trust. Any help would be very much appreciated.
Ps. Sorry if something is not clear - it's my first post :)
Thanks very much!
From lme4's NEWS file, for version 1.1-4
Standard errors of fixed effects are now computed from the approximate Hessian by default (see the use.hessian argument in vcov.merMod); this gives better (correct) answers when the estimates of the random- and fixed-effect parameters are correlated (Github #47)
The description of the problem is here
You should be able to retrieve the old standard errors from the newer (1.1-7) model by sqrt(diag(vcov(fitted_model,use.hessian=FALSE))), but the new version is more likely to be correct.
For more precise confidence intervals/p values, you can do a likelihood ratio test (use anova to compare nested models) and/or compute the profile confidence intervals with confint(fitted_model,which="beta_").

Formatting x-axis with histogram in R

I want x- axis from 1 to 20 and y-axis from 1 to 6.
My data:
structure(list(HEI.ID = structure(c(12L, 9L, 14L, 19L, 20L, 1L,
7L, 5L, 11L, 3L, 10L, 18L, 2L, 8L, 6L, 15L, 13L, 17L, 4L, 16L
), .Label = c("BF", "CC", "DC", "ER", "IM", "MC", "ME ",
"MM", "MO", "OC", "OM", "OP", "SB", "SD", "SH", "SL", "SN", "TH",
"UN", "WS"), class = "factor"), X2007 = c(18L, 14L, 15L, 20L,
12L, 6L, 17L, 2L, 4L, 11L, 16L, 1L, 9L, 8L, 13L, 4L, 10L, 6L,
3L, 19L), X2008 = c(20L, 9L, 16L, 18L, 8L, 17L, 15L, 6L, 3L,
14L, 19L, 1L, 2L, 12L, 5L, 13L, 11L, 7L, 4L, 10L), X2009 = c(20L,
13L, 17L, 8L, 4L, 9L, 19L, 12L, 2L, 11L, 16L, 1L, 2L, 7L, 6L,
18L, 5L, 15L, 9L, 14L), X2010 = c(20L, 13L, 16L, 13L, 7L, 15L,
19L, 8L, 3L, 9L, 18L, 1L, 5L, 11L, 12L, 6L, 10L, 4L, 2L, 17L),
X2011 = c(20L, 2L, 16L, 14L, 6L, 10L, 17L, 8L, 3L, 15L, 19L,
1L, 4L, 18L, 13L, 11L, 8L, 12L, 4L, 7L), X2012 = c(20L, 12L,
19L, 13L, 8L, 14L, 15L, 10L, 11L, 9L, 17L, 2L, 7L, 18L, 5L,
16L, 3L, 4L, 6L, 1L)), .Names = c("HEI.ID", "X2007", "X2008",
"X2009", "X2010", "X2011", "X2012"), row.names = c(NA, -20L), class = "data.frame")
I use the following commands to draw histograms:
# Draws one histogram per row of HEIrank11 (first column dropped) on a 3 x 4
# plotting grid, using STOF[i,1] as the x-axis label.
# NOTE(review): the loop runs over 20 rows while the grid set by mfrow has
# only 12 cells.
par(mfrow = c(3,4))
for(i in 1:20){
print(i)
# NOTE(review): there is an empty argument between `nclass=12,` and `main=`.
hist(as.numeric(HEIrank11[i,-1]),nclass=12,,main='students/faculty',
xlab = STOF[i,1],cex.lab=1, cex.axis=1, cex.main=1, cex.sub=1)
}
But after using the above commands, I get different numbers on the x-axis and y-axis.
I don't understand what your plot should look like. It's not clear from your question and the data provided.
I've tried to plot it. Please comment if you think it's the way to go.
Considering dt is your data.frame
# Reshape dt to long form, then draw stacked bars of value per HEI.ID,
# filled by variable.
library(reshape)
library(ggplot2)
# NOTE(review): melt() is called with no id/measure arguments; the plot below
# relies on it yielding `variable` and `value` columns next to HEI.ID.
dt <- melt(dt)
ggplot(
  data = dt,
  mapping = aes(x = HEI.ID, y = value, fill = variable)
) +
  geom_bar(stat = "identity")
or
# Same bars as the first variant, but split into one facet row per variable.
# Uses dt, the melted data frame created above; the snippet previously
# referred to `dt1`, which is never defined.
ggplot(data = dt, mapping = aes(x = HEI.ID, y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  facet_grid(variable ~ .)
You could use xlim and ylim parameters in the hist function and control the axes using
axis:
# Histograms for the first 12 rows of HEIrank11 (first column dropped) on a
# 3 x 4 grid, every panel sharing the same fixed axes.
# par() returns the previous settings, which are restored after the loop so
# later plots are not forced onto this grid.
old_par <- par(mfrow = c(3, 4))
for (i in 1:12) {
  print(i)
  # xaxt/yaxt = "n" suppress the default axes so the ticks can be drawn
  # explicitly below; xlim/ylim pin every panel to the same range.
  hist(as.numeric(HEIrank11[i, -1]), nclass = 12, main = "students/faculty",
       xlim = c(0, 21), ylim = c(0, 6), xaxt = "n", yaxt = "n")
  axis(1, at = c(0, 10, 20))
  axis(2, at = 0:6)
}
par(old_par)
Do you really want your y-axis to go from 1 to 6? This will cut off parts of the bars.
Also, you iterate over all 20 rows for a grid with 12 plots. The code above gives the following plot:

r data.frame create new variable

I have a dataframe with around 1.5 million rows and 5 cols. One variable (VARIABLE) is of this type NATIONALITY_YEAR (e.g. SPAIN_1998) and I want to split it in two columns, one containing the Nationality, which is the left side of the name before the underscore, and one containing the Year, right side of the underscore. I have tried with concat.split which should be the easiest way:
aa <- concat.split(mydata, "VARIABLE", sep = "_", drop = FALSE)
but after 2 hours running it did not produce any output. I am not sure if I should leave it running for a longer period of time or if there is a non time consuming way to do this.
Any help on the issue would be very much appreciated!
Here is a reproducible (subset!) sample:
mydata<- structure(list(PROVINCE = c(1L, 4L, 7L, 8L, 11L, 14L, 17L, 20L,
24L, 28L, 30L, 33L, 36L, 41L, 44L, 46L, 48L, 3L, 6L, 8L, 10L,
13L, 15L, 18L, 23L, 26L, 29L, 31L, 35L, 38L, 41L, 46L, 47L, 2L,
4L, 8L, 8L, 11L, 15L, 17L, 21L, 24L, 28L, 30L, 33L, 37L, 41L,
45L, 46L, 49L, 3L, 6L, 8L, 10L, 13L, 15L, 19L, 23L, 27L, 29L,
32L, 36L, 39L, 43L, 46L, 48L, 2L, 5L, 8L, 8L, 12L, 15L, 18L,
21L, 24L, 28L, 30L, 33L, 37L, 41L, 45L, 46L, 50L, 3L, 7L, 8L,
10L, 14L, 16L, 20L, 23L, 27L, 29L, 32L, 36L, 39L, 43L, 46L, 48L,
3L, 6L, 8L, 8L, 12L, 15L, 18L, 21L, 25L, 28L, 31L, 34L, 38L,
41L, 45L, 46L, 50L, 3L, 7L, 8L, 11L, 14L, 17L, 20L, 23L, 27L,
29L, 33L, 36L, 40L, 43L, 46L, 48L, 3L, 6L, 8L, 9L, 12L, 15L,
18L, 22L, 25L, 28L, 31L, 35L, 38L, 41L, 45L, 46L, 50L, 4L, 7L,
8L, 11L, 14L, 17L, 20L, 24L, 28L, 30L, 33L, 36L, 41L, 43L, 46L,
48L, 3L, 6L, 8L, 10L, 13L, 15L, 18L, 22L, 26L, 28L, 31L, 35L,
38L, 41L, 46L, 47L, 1L, 4L, 8L, 8L, 11L, 14L, 17L, 20L, 24L,
28L, 30L, 33L, 36L, 41L, 44L, 46L, 49L, 3L, 6L), AGE5 = structure(c(1L,
5L, 9L, 7L, 6L, 7L, 5L, 8L, 3L, 3L, 3L, 5L, 8L, 2L, 3L, 6L, 9L,
5L, 7L, 4L, 3L, 5L, 8L, 8L, 2L, 8L, 2L, 9L, 7L, 9L, 9L, 2L, 7L,
2L, 9L, 1L, 8L, 8L, 1L, 8L, 1L, 6L, 4L, 6L, 7L, 2L, 3L, 1L, 7L,
5L, 6L, 9L, 5L, 6L, 8L, 9L, 3L, 4L, 3L, 4L, 4L, 1L, 3L, 1L, 2L,
2L, 6L, 6L, 2L, 9L, 2L, 2L, 1L, 5L, 9L, 5L, 8L, 9L, 7L, 4L, 3L,
7L, 2L, 8L, 2L, 6L, 9L, 1L, 5L, 1L, 6L, 6L, 6L, 7L, 3L, 6L, 3L,
3L, 4L, 1L, 1L, 2L, 9L, 6L, 4L, 3L, 8L, 3L, 7L, 1L, 5L, 2L, 6L,
6L, 8L, 5L, 9L, 5L, 6L, 2L, 3L, 1L, 4L, 8L, 9L, 8L, 1L, 5L, 1L,
6L, 4L, 6L, 2L, 3L, 3L, 5L, 9L, 5L, 5L, 4L, 7L, 8L, 4L, 2L, 5L,
7L, 8L, 9L, 8L, 3L, 7L, 7L, 5L, 6L, 3L, 6L, 1L, 2L, 2L, 3L, 7L,
1L, 9L, 5L, 8L, 4L, 5L, 4L, 1L, 3L, 7L, 7L, 9L, 3L, 9L, 7L, 5L,
7L, 8L, 1L, 4L, 4L, 6L, 1L, 8L, 7L, 8L, 6L, 8L, 4L, 3L, 4L, 5L,
9L, 2L, 6L, 6L, 1L, 5L, 7L), .Label = c("10-14", "15-19", "20-24",
"25-29", "30-34", "35-39", "40-44", "45-49", "50-54"), class = "factor"),
ZONA91OK = c(101L, 4079L, 712L, 8205L, 11022L, 14021L, 1714L,
20067L, 2414L, 2810L, 300799L, 3305L, 36026L, 41024L, 4405L,
4607L, 48015L, 308L, 610L, 8121L, 1006L, 1307L, 1511L, 1813L,
2308L, 2605L, 2910L, 310799L, 35026L, 3811L, 411199L, 4601L,
4708L, 202L, 405L, 8015L, 837L, 11033L, 1502L, 1702L, 2112L,
2408L, 28047L, 30015L, 3305L, 3709L, 410199L, 4511L, 1202L,
490699L, 3063L, 610L, 827L, 1006L, 1301L, 15036L, 1901L,
2310L, 2709L, 29025L, 3201L, 36008L, 390899L, 4301L, 46184L,
4805L, 206L, 504L, 817L, 813L, 12135L, 1519L, 1810L, 2104L,
2402L, 28130L, 30030L, 3305L, 3707L, 411399L, 45165L, 46181L,
5008L, 305L, 7026L, 803L, 1006L, 1413L, 16078L, 200999L,
2312L, 2712L, 29069L, 3210L, 3616L, 391199L, 4313L, 46105L,
4805L, 310L, 6153L, 8252L, 8205L, 1205L, 1505L, 1808L, 2110L,
2508L, 2810L, 311399L, 3405L, 3807L, 41024L, 4507L, 46102L,
500599L, 3014L, 706L, 8121L, 11028L, 14042L, 1712L, 20045L,
2314L, 27031L, 29901L, 33024L, 3614L, 400199L, 4307L, 46021L,
4805L, 3066L, 6153L, 8015L, 901L, 12040L, 1522L, 1806L, 2203L,
2508L, 28047L, 311099L, 35004L, 3801L, 410199L, 4515L, 46017L,
501199L, 407L, 7027L, 827L, 1102L, 1404L, 17155L, 200599L,
24089L, 2812L, 30019L, 33024L, 3612L, 41038L, 4301L, 4628L,
4805L, 307L, 6153L, 817L, 1004L, 1309L, 1508L, 1804L, 2206L,
2606L, 28130L, 310799L, 35011L, 38022L, 411399L, 4622L, 4701L,
1036L, 4079L, 807L, 803L, 1108L, 1410L, 1708L, 201399L, 2410L,
28058L, 30043L, 33024L, 3610L, 410399L, 4401L, 4621L, 490499L,
3059L, 6153L), VARIABLE = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L), .Label = c("SPAIN_1998",
"EU15DC_1998", "ROE_1998", "MAGREB_1998", "SSA_1998", "LA_1998",
"ASIA_1998", "ROW_1998", "Total_1998", "SPAIN_1999", "EU15DC_1999",
"ROE_1999", "MAGREB_1999", "SSA_1999", "LA_1999", "ASIA_1999",
"ROW_1999", "Total_1999", "SPAIN_2000", "EU15DC_2000", "ROE_2000",
"MAGREB_2000", "SSA_2000", "LA_2000", "ASIA_2000", "ROW_2000",
"Total_2000", "SPAIN_2001", "EU15DC_2001", "ROE_2001", "MAGREB_2001",
"SSA_2001", "LA_2001", "ASIA_2001", "ROW_2001", "Total_2001",
"SPAIN_2002", "EU15DC_2002", "ROE_2002", "MAGREB_2002", "SSA_2002",
"LA_2002", "ASIA_2002", "ROW_2002", "Total_2002", "SPAIN_2003",
"EU15DC_2003", "ROE_2003", "MAGREB_2003", "SSA_2003", "LA_2003",
"ASIA_2003", "ROW_2003", "Total_2003", "SPAIN_2004", "EU15DC_2004",
"ROE_2004", "MAGREB_2004", "SSA_2004", "LA_2004", "ASIA_2004",
"ROW_2004", "Total_2004", "SPAIN_2005", "EU15DC_2005", "ROE_2005",
"MAGREB_2005", "SSA_2005", "LA_2005", "ASIA_2005", "ROW_2005",
"Total_2005", "SPAIN_2006", "EU15DC_2006", "ROE_2006", "MAGREB_2006",
"SSA_2006", "LA_2006", "ASIA_2006", "ROW_2006", "Total_2006",
"SPAIN_2007", "EU15DC_2007", "ROE_2007", "MAGREB_2007", "SSA_2007",
"LA_2007", "ASIA_2007", "ROW_2007", "Total_2007", "SPAIN_2008",
"EU15DC_2008", "ROE_2008", "MAGREB_2008", "SSA_2008", "LA_2008",
"ASIA_2008", "ROW_2008", "Total_2008", "SPAIN_2009", "EU15DC_2009",
"ROE_2009", "MAGREB_2009", "SSA_2009", "LA_2009", "ASIA_2009",
"ROW_2009", "Total_2009", "SPAIN_2010", "EU15DC_2010", "ROE_2010",
"MAGREB_2010", "SSA_2010", "LA_2010", "ASIA_2010", "ROW_2010",
"Total_2010", "SPAIN_2011", "EU15DC_2011", "ROE_2011", "MAGREB_2011",
"SSA_2011", "LA_2011", "ASIA_2011", "ROW_2011", "Total_2011",
"SPAIN_2012", "EU15DC_2012", "ROE_2012", "MAGREB_2012", "SSA_2012",
"LA_2012", "ASIA_2012", "ROW_2012", "Total_2012", "NOTSPAIN_1998",
"NOTSPAIN_1999", "NOTSPAIN_2000", "NOTSPAIN_2001", "NOTSPAIN_2002",
"NOTSPAIN_2003", "NOTSPAIN_2004", "NOTSPAIN_2005", "NOTSPAIN_2006",
"NOTSPAIN_2007", "NOTSPAIN_2008", "NOTSPAIN_2009", "NOTSPAIN_2010",
"NOTSPAIN_2011", "NOTSPAIN_2012", "AFRICA_1998", "AFRICA_1999",
"AFRICA_2000", "AFRICA_2001", "AFRICA_2002", "AFRICA_2003",
"AFRICA_2004", "AFRICA_2005", "AFRICA_2006", "AFRICA_2007",
"AFRICA_2008", "AFRICA_2009", "AFRICA_2010", "AFRICA_2011",
"AFRICA_2012", "DWC_1998", "DWC_1999", "DWC_2000", "DWC_2001",
"DWC_2002", "DWC_2003", "DWC_2004", "DWC_2005", "DWC_2006",
"DWC_2007", "DWC_2008", "DWC_2009", "DWC_2010", "DWC_2011",
"DWC_2012"), class = "factor"), FREQUENCY = c(614, 1943,
59, 201, 188, 10859, 93,
1494, 60, 1001, 1000, 689, 675, 934, 51,
1240, 165, 13, 0, 14, 2, 2,
2, 0, 3, 0, 40, 1, 18, 41, 1, 0, 3, 0, 0, 0, 1, 0,
0, 0, 0, 0, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 80, 0,
0, 0, 4, 0, 0, 15, 0, 0, 1, 1, 3, 4, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 2, 11, 0, 0, 0, 3, 2, 1, 5,
64, 1, 4, 1, 3, 4, 8, 1, 1, 1, 1, 0, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 2173, 907, 9059, 839,
4303, 100, 1727, 663, 694, 1210, 623,
1261, 772, 697, 490, 1031, 490, 956, 704,
1293, 1011, 739, 927, 755, 3340, 1190, 1254, 12880, 528,
3244, 277, 892, 837, 1, 2, 10, 1, 1, 2, 2, 0, 0, 1, 8, 3,
12, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0, 1, 12, 0, 7, 0, 0, 0,
0, 0, 5, 2)), .Names = c("PROVINCE", "AGE5", "ZONA91OK",
"VARIABLE", "FREQUENCY"), row.names = c(1L, 501L, 1001L, 1501L,
2001L, 2501L, 3001L, 3501L, 4001L, 4501L, 5001L, 5501L, 6001L,
6501L, 7001L, 7501L, 8001L, 8501L, 9001L, 9501L, 10001L, 10501L,
11001L, 11501L, 12001L, 12501L, 13001L, 13501L, 14001L, 14501L,
15001L, 15501L, 16001L, 16501L, 17001L, 17501L, 18001L, 18501L,
19001L, 19501L, 20001L, 20501L, 21001L, 21501L, 22001L, 22501L,
23001L, 23501L, 24001L, 24501L, 25001L, 25501L, 26001L, 26501L,
27001L, 27501L, 28001L, 28501L, 29001L, 29501L, 30001L, 30501L,
31001L, 31501L, 32001L, 32501L, 33001L, 33501L, 34001L, 34501L,
35001L, 35501L, 36001L, 36501L, 37001L, 37501L, 38001L, 38501L,
39001L, 39501L, 40001L, 40501L, 41001L, 41501L, 42001L, 42501L,
43001L, 43501L, 44001L, 44501L, 45001L, 45501L, 46001L, 46501L,
47001L, 47501L, 48001L, 48501L, 49001L, 49501L, 50001L, 50501L,
51001L, 51501L, 52001L, 52501L, 53001L, 53501L, 54001L, 54501L,
55001L, 55501L, 56001L, 56501L, 57001L, 57501L, 58001L, 58501L,
59001L, 59501L, 60001L, 60501L, 61001L, 61501L, 62001L, 62501L,
63001L, 63501L, 64001L, 64501L, 65001L, 65501L, 66001L, 66501L,
67001L, 67501L, 68001L, 68501L, 69001L, 69501L, 70001L, 70501L,
71001L, 71501L, 72001L, 72501L, 73001L, 73501L, 74001L, 74501L,
75001L, 75501L, 76001L, 76501L, 77001L, 77501L, 78001L, 78501L,
79001L, 79501L, 80001L, 80501L, 81001L, 81501L, 82001L, 82501L,
83001L, 83501L, 84001L, 84501L, 85001L, 85501L, 86001L, 86501L,
87001L, 87501L, 88001L, 88501L, 89001L, 89501L, 90001L, 90501L,
91001L, 91501L, 92001L, 92501L, 93001L, 93501L, 94001L, 94501L,
95001L, 95501L, 96001L, 96501L, 97001L, 97501L, 98001L, 98501L,
99001L, 99501L), class = "data.frame")
Try this instead:
library(data.table)
dt <- data.table(mydata)
# The text before the last underscore becomes NATIONALITY and the text
# after it becomes YEAR; both columns are added by reference.
dt[, c("NATIONALITY", "YEAR") := list(
  sub("(.*)_(.*)", "\\1", VARIABLE),
  sub("(.*)_(.*)", "\\2", VARIABLE)
)]
It seems like I need to look into updating my concat.split functions!
The version of the function that you tried to use makes use of read.table, which does tend to struggle with large datasets. I had used read.table because it has a convenient text argument that lets you specify a column in a data.frame as the input. This is really convenient when working with small-ish datasets, but evidently not with larger ones :)
As far as I can tell, fread from the "data.table" package doesn't have a similar feature, but since R tends to write files pretty quickly, I thought that it would be worth trying a similar approach as what I used in concat.split with fread instead of read.table.
Here's the concept:
Write the variable that needs to be split to a new file.
Use the blazing fast fread to read it back in.
Wait for fread to get a text argument somewhere down the line?
Here's that concept as a function (updated with edits as per @eddi's suggestions in the comments):
#' Split a delimited column into new columns via a temp file and fread
#'
#' The column is written to a temporary file, one value per line, and read
#' back with `fread()` using `sep` as the field separator. The parsed fields
#' are appended to the data as `<splitcol>_1`, `<splitcol>_2`, ...
#'
#' @param dataset A data.frame or data.table. It is never modified in place;
#'   the function works on its own copy.
#' @param splitcol Name of the column to split, or its numeric position.
#' @param sep Separator used inside `splitcol`.
#' @param drop If `TRUE`, the original `splitcol` is removed from the result.
#' @return A data.table holding the columns of `dataset` (with `splitcol`
#'   converted to character, or dropped) followed by the split columns.
csDataTable <- function(dataset, splitcol, sep, drop = FALSE) {
  if (is.numeric(splitcol)) splitcol <- names(dataset)[splitcol]

  # `:=` below modifies by reference. Take a private copy so a data.table
  # handed in by the caller is not silently altered (data.table() already
  # copies when given a data.frame).
  dataset <- if (is.data.table(dataset)) copy(dataset) else data.table(dataset)

  if (!is.character(dataset[[splitcol]])) {
    dataset[, (splitcol) := as.character(get(splitcol))]
  }

  # Text that will be written out and re-parsed. A "." separator is swapped
  # for "|" in this text only, so the original column that is returned keeps
  # its values. NOTE(review): presumably "." is avoided because fread does
  # not treat it reliably as a field separator -- confirm.
  txt <- dataset[[splitcol]]
  if (sep == ".") {
    txt <- gsub(".", "|", txt, fixed = TRUE)
    sep <- "|"
  }

  x <- tempfile()
  # Remove the temporary file however the function exits.
  on.exit(unlink(x), add = TRUE)
  writeLines(txt, x)
  Split <- fread(x, sep = sep, header = FALSE)
  setnames(Split, paste(splitcol, seq_along(Split), sep = "_"))

  if (isTRUE(drop)) dataset[, (splitcol) := NULL]
  cbind(dataset, Split)
}
Here's the function in action:
## Repeat the sample rows until there are 1.5 million of them, then split
## VARIABLE on "_" as a test
out <- mydata[rep(rownames(mydata), times = 1500000 / nrow(mydata)), ]
csDataTable(dataset = out, splitcol = "VARIABLE", sep = "_")
# PROVINCE AGE5 ZONA91OK VARIABLE FREQUENCY VARIABLE_1 VARIABLE_2
# 1: 1 10-14 101 SPAIN_1998 614 SPAIN 1998
# 2: 4 30-34 4079 SPAIN_1998 1943 SPAIN 1998
# 3: 7 50-54 712 SPAIN_1998 59 SPAIN 1998
# 4: 8 40-44 8205 SPAIN_1998 201 SPAIN 1998
# 5: 11 35-39 11022 SPAIN_1998 188 SPAIN 1998
# ---
# 1499996: 44 35-39 4401 ROE_1999 0 ROE 1999
# 1499997: 46 35-39 4621 ROE_1999 0 ROE 1999
# 1499998: 49 10-14 490499 ROE_1999 0 ROE 1999
# 1499999: 3 30-34 3059 MAGREB_1999 5 MAGREB 1999
# 1500000: 6 40-44 6153 MAGREB_1999 2 MAGREB 1999
In this test, at least, the solution fares much better than I expected:
# Benchmark candidate: regex-based split applied to a fresh data.table
# built from `out`.
subFun <- function() {
  dt <- data.table(out)
  dt[, c("NATIONALITY", "YEAR") := list(
    sub("(.*)_(.*)", "\\1", VARIABLE),
    sub("(.*)_(.*)", "\\2", VARIABLE)
  )]
}
# Benchmark candidate: split VARIABLE in `out` with csDataTable().
freadFun <- function() {
  csDataTable(dataset = out, splitcol = "VARIABLE", sep = "_")
}
# Time both candidates, evaluating each expression 20 times.
library(microbenchmark)
microbenchmark(subFun(), freadFun(), times = 20)
# Unit: seconds
# expr min lq median uq max neval
# subFun() 3.814174 4.244820 4.273834 4.345358 4.480520 20
# freadFun() 1.356533 2.064262 2.152159 2.226465 2.300886 20
Here is a solution that splits the factor labels instead of every row:
# Lookup table with one row per factor level of mydata$VARIABLE: the level
# itself plus its "_"-separated parts (columns V1 and V2). Splitting the
# levels rather than every row keeps the string work small.
VARIABLE_LEVELS <- cbind(
  "VARIABLE" = levels(mydata$VARIABLE),
  as.data.frame(
    do.call("rbind", strsplit(levels(mydata$VARIABLE), split = "_"))
  )
)
# Option 1: join the split parts onto mydata through the VARIABLE column,
# the only column name the two tables share.
mydata <- merge(mydata, VARIABLE_LEVELS)
#
# Option 2: instead of merge you can use VARIABLE (in mydata) as a row index
# into VARIABLE_LEVELS, whose rows follow the order of the factor levels.
# Use one option or the other, not both.
#
mydata <- cbind(mydata, VARIABLE_LEVELS[as.integer(mydata$VARIABLE),c("V1","V2")])

Resources