r data.frame create new variable - r

I have a dataframe with around 1.5 million rows and 5 cols. One variable (VARIABLE) is of this type NATIONALITY_YEAR (e.g. SPAIN_1998) and I want to split it in two columns, one containing the Nationality, which is the left side of the name before the underscore, and one containing the Year, right side of the underscore. I have tried with concat.split which should be the easiest way:
aa <- concat.split(mydata, "VARIABLE", sep = "_", drop = F)
but after 2 hours running it did not produce any output. I am not sure if I should leave it running for a longer period of time or if there is a non time consuming way to do this.
Any help on the issue would be very much appreciated!
Here is a reproducible (subset!) sample:
mydata<- structure(list(PROVINCE = c(1L, 4L, 7L, 8L, 11L, 14L, 17L, 20L,
24L, 28L, 30L, 33L, 36L, 41L, 44L, 46L, 48L, 3L, 6L, 8L, 10L,
13L, 15L, 18L, 23L, 26L, 29L, 31L, 35L, 38L, 41L, 46L, 47L, 2L,
4L, 8L, 8L, 11L, 15L, 17L, 21L, 24L, 28L, 30L, 33L, 37L, 41L,
45L, 46L, 49L, 3L, 6L, 8L, 10L, 13L, 15L, 19L, 23L, 27L, 29L,
32L, 36L, 39L, 43L, 46L, 48L, 2L, 5L, 8L, 8L, 12L, 15L, 18L,
21L, 24L, 28L, 30L, 33L, 37L, 41L, 45L, 46L, 50L, 3L, 7L, 8L,
10L, 14L, 16L, 20L, 23L, 27L, 29L, 32L, 36L, 39L, 43L, 46L, 48L,
3L, 6L, 8L, 8L, 12L, 15L, 18L, 21L, 25L, 28L, 31L, 34L, 38L,
41L, 45L, 46L, 50L, 3L, 7L, 8L, 11L, 14L, 17L, 20L, 23L, 27L,
29L, 33L, 36L, 40L, 43L, 46L, 48L, 3L, 6L, 8L, 9L, 12L, 15L,
18L, 22L, 25L, 28L, 31L, 35L, 38L, 41L, 45L, 46L, 50L, 4L, 7L,
8L, 11L, 14L, 17L, 20L, 24L, 28L, 30L, 33L, 36L, 41L, 43L, 46L,
48L, 3L, 6L, 8L, 10L, 13L, 15L, 18L, 22L, 26L, 28L, 31L, 35L,
38L, 41L, 46L, 47L, 1L, 4L, 8L, 8L, 11L, 14L, 17L, 20L, 24L,
28L, 30L, 33L, 36L, 41L, 44L, 46L, 49L, 3L, 6L), AGE5 = structure(c(1L,
5L, 9L, 7L, 6L, 7L, 5L, 8L, 3L, 3L, 3L, 5L, 8L, 2L, 3L, 6L, 9L,
5L, 7L, 4L, 3L, 5L, 8L, 8L, 2L, 8L, 2L, 9L, 7L, 9L, 9L, 2L, 7L,
2L, 9L, 1L, 8L, 8L, 1L, 8L, 1L, 6L, 4L, 6L, 7L, 2L, 3L, 1L, 7L,
5L, 6L, 9L, 5L, 6L, 8L, 9L, 3L, 4L, 3L, 4L, 4L, 1L, 3L, 1L, 2L,
2L, 6L, 6L, 2L, 9L, 2L, 2L, 1L, 5L, 9L, 5L, 8L, 9L, 7L, 4L, 3L,
7L, 2L, 8L, 2L, 6L, 9L, 1L, 5L, 1L, 6L, 6L, 6L, 7L, 3L, 6L, 3L,
3L, 4L, 1L, 1L, 2L, 9L, 6L, 4L, 3L, 8L, 3L, 7L, 1L, 5L, 2L, 6L,
6L, 8L, 5L, 9L, 5L, 6L, 2L, 3L, 1L, 4L, 8L, 9L, 8L, 1L, 5L, 1L,
6L, 4L, 6L, 2L, 3L, 3L, 5L, 9L, 5L, 5L, 4L, 7L, 8L, 4L, 2L, 5L,
7L, 8L, 9L, 8L, 3L, 7L, 7L, 5L, 6L, 3L, 6L, 1L, 2L, 2L, 3L, 7L,
1L, 9L, 5L, 8L, 4L, 5L, 4L, 1L, 3L, 7L, 7L, 9L, 3L, 9L, 7L, 5L,
7L, 8L, 1L, 4L, 4L, 6L, 1L, 8L, 7L, 8L, 6L, 8L, 4L, 3L, 4L, 5L,
9L, 2L, 6L, 6L, 1L, 5L, 7L), .Label = c("10-14", "15-19", "20-24",
"25-29", "30-34", "35-39", "40-44", "45-49", "50-54"), class = "factor"),
ZONA91OK = c(101L, 4079L, 712L, 8205L, 11022L, 14021L, 1714L,
20067L, 2414L, 2810L, 300799L, 3305L, 36026L, 41024L, 4405L,
4607L, 48015L, 308L, 610L, 8121L, 1006L, 1307L, 1511L, 1813L,
2308L, 2605L, 2910L, 310799L, 35026L, 3811L, 411199L, 4601L,
4708L, 202L, 405L, 8015L, 837L, 11033L, 1502L, 1702L, 2112L,
2408L, 28047L, 30015L, 3305L, 3709L, 410199L, 4511L, 1202L,
490699L, 3063L, 610L, 827L, 1006L, 1301L, 15036L, 1901L,
2310L, 2709L, 29025L, 3201L, 36008L, 390899L, 4301L, 46184L,
4805L, 206L, 504L, 817L, 813L, 12135L, 1519L, 1810L, 2104L,
2402L, 28130L, 30030L, 3305L, 3707L, 411399L, 45165L, 46181L,
5008L, 305L, 7026L, 803L, 1006L, 1413L, 16078L, 200999L,
2312L, 2712L, 29069L, 3210L, 3616L, 391199L, 4313L, 46105L,
4805L, 310L, 6153L, 8252L, 8205L, 1205L, 1505L, 1808L, 2110L,
2508L, 2810L, 311399L, 3405L, 3807L, 41024L, 4507L, 46102L,
500599L, 3014L, 706L, 8121L, 11028L, 14042L, 1712L, 20045L,
2314L, 27031L, 29901L, 33024L, 3614L, 400199L, 4307L, 46021L,
4805L, 3066L, 6153L, 8015L, 901L, 12040L, 1522L, 1806L, 2203L,
2508L, 28047L, 311099L, 35004L, 3801L, 410199L, 4515L, 46017L,
501199L, 407L, 7027L, 827L, 1102L, 1404L, 17155L, 200599L,
24089L, 2812L, 30019L, 33024L, 3612L, 41038L, 4301L, 4628L,
4805L, 307L, 6153L, 817L, 1004L, 1309L, 1508L, 1804L, 2206L,
2606L, 28130L, 310799L, 35011L, 38022L, 411399L, 4622L, 4701L,
1036L, 4079L, 807L, 803L, 1108L, 1410L, 1708L, 201399L, 2410L,
28058L, 30043L, 33024L, 3610L, 410399L, 4401L, 4621L, 490499L,
3059L, 6153L), VARIABLE = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L), .Label = c("SPAIN_1998",
"EU15DC_1998", "ROE_1998", "MAGREB_1998", "SSA_1998", "LA_1998",
"ASIA_1998", "ROW_1998", "Total_1998", "SPAIN_1999", "EU15DC_1999",
"ROE_1999", "MAGREB_1999", "SSA_1999", "LA_1999", "ASIA_1999",
"ROW_1999", "Total_1999", "SPAIN_2000", "EU15DC_2000", "ROE_2000",
"MAGREB_2000", "SSA_2000", "LA_2000", "ASIA_2000", "ROW_2000",
"Total_2000", "SPAIN_2001", "EU15DC_2001", "ROE_2001", "MAGREB_2001",
"SSA_2001", "LA_2001", "ASIA_2001", "ROW_2001", "Total_2001",
"SPAIN_2002", "EU15DC_2002", "ROE_2002", "MAGREB_2002", "SSA_2002",
"LA_2002", "ASIA_2002", "ROW_2002", "Total_2002", "SPAIN_2003",
"EU15DC_2003", "ROE_2003", "MAGREB_2003", "SSA_2003", "LA_2003",
"ASIA_2003", "ROW_2003", "Total_2003", "SPAIN_2004", "EU15DC_2004",
"ROE_2004", "MAGREB_2004", "SSA_2004", "LA_2004", "ASIA_2004",
"ROW_2004", "Total_2004", "SPAIN_2005", "EU15DC_2005", "ROE_2005",
"MAGREB_2005", "SSA_2005", "LA_2005", "ASIA_2005", "ROW_2005",
"Total_2005", "SPAIN_2006", "EU15DC_2006", "ROE_2006", "MAGREB_2006",
"SSA_2006", "LA_2006", "ASIA_2006", "ROW_2006", "Total_2006",
"SPAIN_2007", "EU15DC_2007", "ROE_2007", "MAGREB_2007", "SSA_2007",
"LA_2007", "ASIA_2007", "ROW_2007", "Total_2007", "SPAIN_2008",
"EU15DC_2008", "ROE_2008", "MAGREB_2008", "SSA_2008", "LA_2008",
"ASIA_2008", "ROW_2008", "Total_2008", "SPAIN_2009", "EU15DC_2009",
"ROE_2009", "MAGREB_2009", "SSA_2009", "LA_2009", "ASIA_2009",
"ROW_2009", "Total_2009", "SPAIN_2010", "EU15DC_2010", "ROE_2010",
"MAGREB_2010", "SSA_2010", "LA_2010", "ASIA_2010", "ROW_2010",
"Total_2010", "SPAIN_2011", "EU15DC_2011", "ROE_2011", "MAGREB_2011",
"SSA_2011", "LA_2011", "ASIA_2011", "ROW_2011", "Total_2011",
"SPAIN_2012", "EU15DC_2012", "ROE_2012", "MAGREB_2012", "SSA_2012",
"LA_2012", "ASIA_2012", "ROW_2012", "Total_2012", "NOTSPAIN_1998",
"NOTSPAIN_1999", "NOTSPAIN_2000", "NOTSPAIN_2001", "NOTSPAIN_2002",
"NOTSPAIN_2003", "NOTSPAIN_2004", "NOTSPAIN_2005", "NOTSPAIN_2006",
"NOTSPAIN_2007", "NOTSPAIN_2008", "NOTSPAIN_2009", "NOTSPAIN_2010",
"NOTSPAIN_2011", "NOTSPAIN_2012", "AFRICA_1998", "AFRICA_1999",
"AFRICA_2000", "AFRICA_2001", "AFRICA_2002", "AFRICA_2003",
"AFRICA_2004", "AFRICA_2005", "AFRICA_2006", "AFRICA_2007",
"AFRICA_2008", "AFRICA_2009", "AFRICA_2010", "AFRICA_2011",
"AFRICA_2012", "DWC_1998", "DWC_1999", "DWC_2000", "DWC_2001",
"DWC_2002", "DWC_2003", "DWC_2004", "DWC_2005", "DWC_2006",
"DWC_2007", "DWC_2008", "DWC_2009", "DWC_2010", "DWC_2011",
"DWC_2012"), class = "factor"), FREQUENCY = c(614, 1943,
59, 201, 188, 10859, 93,
1494, 60, 1001, 1000, 689, 675, 934, 51,
1240, 165, 13, 0, 14, 2, 2,
2, 0, 3, 0, 40, 1, 18, 41, 1, 0, 3, 0, 0, 0, 1, 0,
0, 0, 0, 0, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 80, 0,
0, 0, 4, 0, 0, 15, 0, 0, 1, 1, 3, 4, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 2, 11, 0, 0, 0, 3, 2, 1, 5,
64, 1, 4, 1, 3, 4, 8, 1, 1, 1, 1, 0, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 2173, 907, 9059, 839,
4303, 100, 1727, 663, 694, 1210, 623,
1261, 772, 697, 490, 1031, 490, 956, 704,
1293, 1011, 739, 927, 755, 3340, 1190, 1254, 12880, 528,
3244, 277, 892, 837, 1, 2, 10, 1, 1, 2, 2, 0, 0, 1, 8, 3,
12, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0, 1, 12, 0, 7, 0, 0, 0,
0, 0, 5, 2)), .Names = c("PROVINCE", "AGE5", "ZONA91OK",
"VARIABLE", "FREQUENCY"), row.names = c(1L, 501L, 1001L, 1501L,
2001L, 2501L, 3001L, 3501L, 4001L, 4501L, 5001L, 5501L, 6001L,
6501L, 7001L, 7501L, 8001L, 8501L, 9001L, 9501L, 10001L, 10501L,
11001L, 11501L, 12001L, 12501L, 13001L, 13501L, 14001L, 14501L,
15001L, 15501L, 16001L, 16501L, 17001L, 17501L, 18001L, 18501L,
19001L, 19501L, 20001L, 20501L, 21001L, 21501L, 22001L, 22501L,
23001L, 23501L, 24001L, 24501L, 25001L, 25501L, 26001L, 26501L,
27001L, 27501L, 28001L, 28501L, 29001L, 29501L, 30001L, 30501L,
31001L, 31501L, 32001L, 32501L, 33001L, 33501L, 34001L, 34501L,
35001L, 35501L, 36001L, 36501L, 37001L, 37501L, 38001L, 38501L,
39001L, 39501L, 40001L, 40501L, 41001L, 41501L, 42001L, 42501L,
43001L, 43501L, 44001L, 44501L, 45001L, 45501L, 46001L, 46501L,
47001L, 47501L, 48001L, 48501L, 49001L, 49501L, 50001L, 50501L,
51001L, 51501L, 52001L, 52501L, 53001L, 53501L, 54001L, 54501L,
55001L, 55501L, 56001L, 56501L, 57001L, 57501L, 58001L, 58501L,
59001L, 59501L, 60001L, 60501L, 61001L, 61501L, 62001L, 62501L,
63001L, 63501L, 64001L, 64501L, 65001L, 65501L, 66001L, 66501L,
67001L, 67501L, 68001L, 68501L, 69001L, 69501L, 70001L, 70501L,
71001L, 71501L, 72001L, 72501L, 73001L, 73501L, 74001L, 74501L,
75001L, 75501L, 76001L, 76501L, 77001L, 77501L, 78001L, 78501L,
79001L, 79501L, 80001L, 80501L, 81001L, 81501L, 82001L, 82501L,
83001L, 83501L, 84001L, 84501L, 85001L, 85501L, 86001L, 86501L,
87001L, 87501L, 88001L, 88501L, 89001L, 89501L, 90001L, 90501L,
91001L, 91501L, 92001L, 92501L, 93001L, 93501L, 94001L, 94501L,
95001L, 95501L, 96001L, 96501L, 97001L, 97501L, 98001L, 98501L,
99001L, 99501L), class = "data.frame")

Try this instead:
library(data.table)
# Convert once, then add both columns by reference in a single `:=` call.
# The greedy first group means the split happens at the LAST underscore;
# both new columns are character vectors (sub() returns character).
dt <- data.table(mydata)
dt[, c("NATIONALITY", "YEAR") := list(
  sub('(.*)_(.*)', '\\1', VARIABLE),
  sub('(.*)_(.*)', '\\2', VARIABLE)
)]

It seems like I need to look into updating my concat.split functions!
The version of the function that you tried to use makes use of read.table, which does tend to struggle with large datasets. I had used read.table because it has a convenient text argument that lets you specify a column in a data.frame as the input. This is really convenient when working with small-ish datasets, but evidently not with larger ones :)
As far as I can tell, fread from the "data.table" package doesn't have a similar feature, but since R tends to write files pretty quickly, I thought that it would be worth trying a similar approach as what I used in concat.split with fread instead of read.table.
Here's the concept:
Write the variable that needs to be split to a new file.
Use the blazing fast fread to read it back in.
Wait for fread to get a text argument somewhere down the line?
Here's that concept as a function (updated with edits as per @eddi's suggestions in the comments):
#' Split a delimited column into several columns via data.table::fread()
#'
#' The column is written to a temporary file and read back with fread(), and
#' the parsed pieces are bound onto the data as `<splitcol>_1`,
#' `<splitcol>_2`, ...
#'
#' @param dataset A data.frame or data.table. It is never modified in place;
#'   the function works on a copy.
#' @param splitcol Name of the column to split, or its numeric position.
#' @param sep Single-character delimiter separating the pieces.
#' @param drop If `TRUE`, the original column is removed from the result.
#' @return A data.table holding the columns of `dataset` (with `splitcol`
#'   converted to character, or removed when `drop = TRUE`) followed by the
#'   newly created split columns.
csDataTable <- function(dataset, splitcol, sep, drop = FALSE) {
  if (is.numeric(splitcol)) splitcol <- names(dataset)[splitcol]

  # data.table() already copies a data.frame; copy() gives the same guarantee
  # for a data.table so the by-reference updates below never leak into the
  # caller's object.
  dataset <- if (is.data.table(dataset)) copy(dataset) else data.table(dataset)

  values <- as.character(dataset[[splitcol]])
  if (!is.character(dataset[[splitcol]])) {
    # set() is used instead of `:=` so a column that happens to be called
    # "values" cannot shadow the local variable inside `[.data.table`.
    set(dataset, j = splitcol, value = values)
  }

  # "." is swapped for "|" only in the text handed to fread(); the column kept
  # in the result retains its original content.
  to_write <- values
  if (sep == ".") {
    to_write <- gsub(".", "|", values, fixed = TRUE)
    sep <- "|"
  }

  tmp <- tempfile()
  on.exit(unlink(tmp), add = TRUE)  # do not leave the scratch file behind
  writeLines(to_write, tmp)
  Split <- fread(tmp, sep = sep, header = FALSE)
  setnames(Split, paste(splitcol, seq_along(Split), sep = "_"))

  if (isTRUE(drop)) dataset[, (splitcol) := NULL]
  cbind(dataset, Split)
}
Here's the function in action:
## Expand your sample data to 1.5 million rows to test
out <- mydata[rep(rownames(mydata), 1500000/nrow(mydata)), ]
csDataTable(out, "VARIABLE", "_")
# PROVINCE AGE5 ZONA91OK VARIABLE FREQUENCY VARIABLE_1 VARIABLE_2
# 1: 1 10-14 101 SPAIN_1998 614 SPAIN 1998
# 2: 4 30-34 4079 SPAIN_1998 1943 SPAIN 1998
# 3: 7 50-54 712 SPAIN_1998 59 SPAIN 1998
# 4: 8 40-44 8205 SPAIN_1998 201 SPAIN 1998
# 5: 11 35-39 11022 SPAIN_1998 188 SPAIN 1998
# ---
# 1499996: 44 35-39 4401 ROE_1999 0 ROE 1999
# 1499997: 46 35-39 4621 ROE_1999 0 ROE 1999
# 1499998: 49 10-14 490499 ROE_1999 0 ROE 1999
# 1499999: 3 30-34 3059 MAGREB_1999 5 MAGREB 1999
# 1500000: 6 40-44 6153 MAGREB_1999 2 MAGREB 1999
In this test, at least, the solution fares much better than I expected:
# Benchmark candidate 1: regex-based split with sub(), adding both columns
# by reference. Reads the global `out` and returns the data.table invisibly.
subFun <- function() {
  bench_dt <- data.table(out)
  bench_dt[, c("NATIONALITY", "YEAR") := list(
    sub('(.*)_(.*)', '\\1', VARIABLE),
    sub('(.*)_(.*)', '\\2', VARIABLE)
  )]
}
# Benchmark candidate 2: the fread()-based splitter applied to the global
# `out`, splitting VARIABLE on "_".
freadFun <- function() csDataTable(out, "VARIABLE", "_")
library(microbenchmark)
microbenchmark(subFun(), freadFun(), times = 20)
# Unit: seconds
# expr min lq median uq max neval
# subFun() 3.814174 4.244820 4.273834 4.345358 4.480520 20
# freadFun() 1.356533 2.064262 2.152159 2.226465 2.300886 20

Here is a solution that splits the factor levels (labels) instead of every row:
# Build a lookup table with one row per factor level: the level itself plus
# its pieces around "_" (as.data.frame() names them V1 and V2). Only the
# levels are split, not every row of mydata.
VARIABLE_LEVELS <- cbind(
  "VARIABLE" = levels(mydata$VARIABLE),
  as.data.frame(do.call("rbind",
                        strsplit(levels(mydata$VARIABLE), split = "_")))
)
# Option 1: join the split columns back onto the data by VARIABLE.
mydata <- merge(mydata, VARIABLE_LEVELS)
#
# Option 2: instead of merge() you can use VARIABLE (in mydata) as a row index
# into the lookup table. Use one option or the other -- running both adds the
# V1/V2 columns a second time.
#
mydata <- cbind(mydata, VARIABLE_LEVELS[as.integer(mydata$VARIABLE), c("V1", "V2")])

Related

How do I sum a column based on another column?

Assuming that the dataframe is stored as fruit, and is in the following format:
State Fruit Category Fruit Type Gross Value
ACT CitrusFruit Mandarins $4,500,000
ACT CitrusFruit Oranges
NSW PomeFruit Apple $139,130,203.50
NSW Grapes Wine Production $50,000,000
NSW OrchardStoneFruit Avocados $10,031,123
QLD CitrusFruit Oranges
How would I sum the gross value, based on the State - while excluding blank values. But at the same time, the gross value of each state should be summed, rather than displayed separately for CitrusFruit, PomeFruit, etc.
I have tried to use the
library(plyr)
counts
method to no avail.
Any help would be greatly appreciated.
EDIT:
I have tried to use the following method:
library(dplyr)
fruit %>%
group_by(State) %>%
summarise(Gross = sum(Gross))
However, I am getting an error that says:
Evaluation Error: 'sum' not meaningful for factors.
EDIT:
Output from dput(fruit)
structure(list(State = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L,
8L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L,
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L,
8L), .Label = c("ACT", "NSW", "NT", "QLD", "SA", "TAS", "VIC",
"WA"), class = "factor"), Fruit.Category = structure(c(6L, 6L,
6L, 8L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L,
4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L), .Label = c(" Grapes ", " OrchardStoneFruit ", " OtherFruit ",
" PomeFruit ", " CitrusFruit ", " CitrusFruit ", " Grapes ",
" Grapes ", " OrchardStoneFruit ", " OtherFruit ", " PomeFruit "
), class = "factor"), Fruit.Type = structure(c(5L, 8L, 13L, 18L,
31L, 2L, 4L, 6L, 7L, 9L, 14L, 17L, 3L, 11L, 12L, 15L, 1L, 10L,
16L, 13L, 23L, 26L, 13L, 23L, 26L, 13L, 23L, 26L, 13L, 23L, 26L,
13L, 23L, 26L, 13L, 23L, 26L, 13L, 23L, 26L, 18L, 31L, 18L, 31L,
18L, 31L, 18L, 31L, 18L, 31L, 18L, 31L, 18L, 31L, 14L, 17L, 20L,
22L, 24L, 25L, 27L, 14L, 17L, 20L, 22L, 24L, 25L, 27L, 14L, 17L,
20L, 22L, 24L, 25L, 27L, 14L, 17L, 20L, 22L, 24L, 25L, 27L, 14L,
17L, 20L, 22L, 24L, 25L, 27L, 14L, 17L, 20L, 22L, 24L, 25L, 27L,
14L, 17L, 20L, 22L, 24L, 25L, 27L, 15L, 21L, 29L, 30L, 15L, 21L,
29L, 30L, 15L, 21L, 29L, 30L, 15L, 21L, 29L, 30L, 15L, 21L, 29L,
30L, 15L, 21L, 29L, 30L, 15L, 21L, 29L, 30L, 16L, 19L, 28L, 16L,
19L, 28L, 16L, 19L, 28L, 16L, 19L, 28L, 16L, 19L, 28L, 16L, 19L,
28L, 16L, 19L, 28L), .Label = c(" Apples ", " Avocados ",
" Bananas ", " Cherries ", " Mandarins ", " Mangoes ",
" Nectarines ", " Oranges ", " Peaches ", " Pears ",
" Pineapples ", " Strawberries ", " AllOtherCitrusFruit ",
" AllOtherOrchardFruit ", " AllOtherOtherFruit ", " AllOtherPomeFruit ",
" AllOtherStoneFruit ", " AllOtherUses ", " Apples ", " Avocados ",
" Bananas ", " Cherries ", " Mandarins ", " Mangoes ", " Nectarines ",
" Oranges ", " Peaches ", " Pears ", " Pineapples ", " Strawberries ",
" WineProduction "), class = "factor"), Gross.Value = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 60L, 97L, 23L, 104L, 1L, 1L, 56L, 98L, 36L, 101L, 68L,
11L, 1L, 1L, 1L, 91L, 96L, 57L, 99L, 92L, 21L, 71L, 29L, 48L,
1L, 76L, 51L, 46L, 58L, 1L, 34L, 37L, 14L, 22L, 70L, 18L, 59L,
28L, 32L, 41L, 83L, 61L, 69L, 30L, 1L, 1L, 26L, 1L, 1L, 25L,
35L, 19L, 2L, 80L, 9L, 8L, 7L, 102L, 47L, 31L, 1L, 85L, 75L,
1L, 88L, 93L, 52L, 1L, 66L, 50L, 100L, 43L, 89L, 95L, 2L, 82L,
65L, 5L, 24L, 94L, 33L, 64L, 10L, 90L, 78L, 84L, 62L, 3L, 86L,
20L, 73L, 1L, 38L, 67L, 72L, 15L, 63L, 1L, 1L, 39L, 17L, 1L,
1L, 16L, 40L, 1L, 1L, 103L, 79L, 49L, 1L, 44L, 6L, 105L, 53L,
1L, 1L, 1L, 1L, 81L, 54L, 27L, 87L, 13L, 1L, 55L, 106L, 4L, 42L,
12L, 45L, 77L, 74L), .Label = c("", "$0.00", "$1,025,861.63",
"$1,107,476.82", "$1,135,055.74", "$1,148,385.97", "$1,514,089.93",
"$1,539,762.85", "$1,565,234.83", "$10,469,580.98", "$100,622,922.20",
"$106,039,956.40", "$11,648,561.35", "$113,930,475.80", "$114,195,162.80",
"$12,169,338.44", "$12,492,792.64", "$12,843,528.01", "$120,877,197.60",
"$13,245.08", "$13,331,668.11", "$13,981,075.51", "$130,258,416.50",
"$14,203,578.43", "$14,697,408.09", "$15,085,825.24", "$15,196.71",
"$15,246,349.76", "$154,858,589.30", "$168,325.78", "$17,661,100.37",
"$18,278,371.16", "$188,414.59", "$19,896,312.15", "$2,370,402.03",
"$2,557,589.86", "$209,648,663.50", "$21,426,350.11", "$22,482,034.46",
"$23,929,331.35", "$238,668.61", "$249,675,376.10", "$26,669,599.23",
"$27,540,236.71", "$270,903.84", "$3,485,520.14", "$3,520,605.89",
"$3,659,706.68", "$3,829,198.67", "$301,644.66", "$301,976.25",
"$31,133,715.88", "$313,144.86", "$334,363.30", "$35,212,772.81",
"$37,927,507.70", "$38,989,343.33", "$385,858,491.60", "$4,447,813.26",
"$4,549,208.46", "$4,569,373.00", "$4,702.20", "$4,712,329.56",
"$4,995,833.14", "$40,133,037.39", "$40,481.05", "$435,712,531.70",
"$44,434,103.55", "$443,017.10", "$45,665,029.35", "$45,888,545.67",
"$46,638,011.92", "$47,589.51", "$5,793,841.42", "$5,854,982.37",
"$51,534,636.09", "$53,367,548.56", "$53,377,925.45", "$555,799.71",
"$57,522,144.94", "$57,930,562.37", "$58,316,912.75", "$6,170,170.78",
"$6,791,088.95", "$6,824,520.08", "$623,030.52", "$63,493,163.21",
"$664,237.23", "$7,066,407.60", "$7,168,380.92", "$7,364,245.36",
"$7,426,224.28", "$7,894.54", "$70,218,810.35", "$76,591,000.57",
"$8,596,626.45", "$8,713,417.54", "$85,876,834.41", "$873,748.40",
"$9,262,889.69", "$9,731,658.36", "$9,991,440.81", "$91,781,453.44",
"$92,299.72", "$95,677,012.68", "$983,780.33"), class = "factor")), class = "data.frame", row.names = c(NA,
-152L))
A couple of problems here:
You don't have Gross Value in your data, you have Gross.Value.
That column is factor, which is a more storage-efficient form of strings. Neither factor nor character can be summed. R knows nothing about accounting so the "$" means nothing to it in that context.
Try this:
library(dplyr)
# Gross.Value is a factor of currency strings such as "$4,500,000". Go through
# character, strip everything except digits and the decimal point, then parse;
# blank entries become NA and are skipped by na.rm = TRUE when summing.
fruit %>%
  mutate(
    Gross.Value = as.numeric(gsub("[^0-9.]", "", as.character(Gross.Value)))
  ) %>%
  group_by(State) %>%
  summarize(Gross.Value = sum(Gross.Value, na.rm = TRUE))
# # A tibble: 8 x 2
# State Gross.Value
# <fct> <dbl>
# 1 ACT 0
# 2 NSW 564400574.
# 3 NT 20133040.
# 4 QLD 1053007677.
# 5 SA 691850721.
# 6 TAS 112902970.
# 7 VIC 1069102796.
# 8 WA 281014929.
The only changes from my comment were (1) using the correct column name, and (2) adding na.rm=TRUE, since you have many blanks. This means you need to be careful how you use this data, as you now have biases and inaccuracies in your summary.
You should convert the factor to numeric and then sum. Here is the solution I came up with:
library(tidyverse)
## Convert the factor into a numeric variable: make it a character vector,
## remove the dollar signs and commas, then convert to a number. The column in
## the posted data is named Gross.Value (with a dot), not `Gross Value`.
fruit$Gross.Value <- as.numeric(
  str_replace_all(as.character(fruit$Gross.Value), "\\$|\\,", "")
)
## Then you can run your sum function; na.rm = TRUE skips the blank entries,
## which are NA after the conversion.
fruit %>%
  group_by(State) %>%
  summarise(Gross = sum(Gross.Value, na.rm = TRUE))

Calculate the percentile rank of a set of scores based on a population

I have a set of scores on 31 competencies for one individual
data <- data.frame(scores=sample(100,31,replace=T),row.names=paste0("X",1:31))
and I have a benchmark file for scores of 100 (or any number) individuals on the same 31 competencies
bmark <- data.frame(replicate(31,sample(0:100,100,rep=TRUE)))
I need to calculate the percentile rank of each competency score for the individual relative to the benchmark for that competency, the output should look something like this (random values, just to demonstrate the required format)
data <- data.frame(scores=sample(100,31,replace=T),percentile=sample(100,31,replace=T),row.names=paste0("X",1:31))
I can calculate each score's percentile rank using a basic percentile formula:
length(bmark$X31[bmark$X31<data$scores[rownames(data)=="X31"]])/length(bmark$X31)*100
But I don't know how to make this work across the whole data set at once, so that each row in data$percentile has the appropriate value for the corresponding competency in the benchmark file. I could do this with a loop, but I'm still not great with R vectorisation.
EDIT:
Here is a dput of my data frames for subsequent comments/questions below. Note that this is a snippet of the bmark data (only a few columns/'competencies'). The data frame `data` is in long format (so it contains all competencies) and has a column identifying the subject (data$Split).
bmark
bmark <- structure(list(FinPerf = c(2.41333333333333, 2.94047619047619,
2.87538940809969, 2.16666666666667, 2.34146341463415, 2.425,
2.58148148148148, 2.97297297297297, 2.325, 2.52542372881356,
2.35593220338983, 1.69105691056911, 2.73493975903614, 2.01666666666667,
1.94010416666667, 1.94603174603175, 2.56666666666667, 2.48550724637681,
2.61846110878369, 2.15873015873016, 2.56565656565657, 2.23529411764706,
2.98974358974359, 3.09195402298851, 2.59289617486339, 2.44306418219462,
2.72345679012346, 2.85714285714286, 2.62962962962963, 2.90833333333333
), Opt = c(2.74, 2.83928571428571, 2.87383177570093, 2.56692506459948,
2.64634146341463, 2.6625, 2.53333333333333, 3.31081081081081,
2.4, 2.50847457627119, 2.55932203389831, 1.89024390243902, 2.60240963855422,
2.3, 2.3203125, 2.24761904761905, 2.85, 2.76086956521739, 2.78064516129032,
2.30952380952381, 2.65151515151515, 2.29411764705882, 3.06923076923077,
3.12931034482759, 2.64754098360656, 2.08695652173913, 2.41111111111111,
2.78571428571429, 2.88888888888889, 2.9625), SatExp = c(2.44,
2.58928571428571, 2.70093457943925, 2.11111111111111, 2.29268292682927,
2.525, 2.33888888888889, 3.10810810810811, 2.375, 2.73728813559322,
2.6864406779661, 1.91463414634146, 2.65060240963855, 2.225, 2.01171875,
1.94285714285714, 2.6, 2.51086956521739, 2.83225806451613, 2.28571428571429,
2.84848484848485, 2.02941176470588, 3.13076923076923, 3.22413793103448,
2.45491803278689, 2.5, 2.78518518518518, 2.85714285714286, 2.75925925925926,
2.975)), .Names = c("FinPerf", "Opt", "SatExp"), class = "data.frame", row.names = c(NA,
-30L))
And data
data <- structure(list(Area.short = structure(c(13L, 25L, 28L, 6L, 3L,
16L, 12L, 9L, 7L, 20L, 21L, 14L, 5L, 4L, 26L, 8L, 11L, 18L, 2L,
27L, 31L, 30L, 24L, 19L, 1L, 23L, 22L, 10L, 15L, 29L, 17L, 13L,
25L, 28L, 6L, 3L, 16L, 12L, 9L, 7L, 20L, 21L, 14L, 5L, 4L, 26L,
8L, 11L, 18L, 2L, 27L, 31L, 30L, 24L, 19L, 1L, 23L, 22L, 10L,
15L, 29L, 17L, 13L, 25L, 28L, 6L, 3L, 16L, 12L, 9L, 7L, 20L,
21L, 14L, 5L, 4L, 26L, 8L, 11L, 18L, 2L, 27L, 31L, 30L, 24L,
19L, 1L, 23L, 22L, 10L, 15L, 29L, 17L, 13L, 25L, 28L, 6L, 3L,
16L, 12L, 9L, 7L, 20L, 21L, 14L, 5L, 4L, 26L, 8L, 11L, 18L, 2L,
27L, 31L, 30L, 24L, 19L, 1L, 23L, 22L, 10L, 15L, 29L, 17L, 13L,
25L, 28L, 6L, 3L, 16L, 12L, 9L, 7L, 20L, 21L, 14L, 5L, 4L, 26L,
8L, 11L, 18L, 2L, 27L, 31L, 30L, 24L, 19L, 1L, 23L, 22L, 10L,
15L, 29L, 17L, 13L, 25L, 28L, 6L, 3L, 16L, 12L, 9L, 7L, 20L,
21L, 14L, 5L, 4L, 26L, 8L, 11L, 18L, 2L, 27L, 31L, 30L, 24L,
19L, 1L, 23L, 22L, 10L, 15L, 29L, 17L, 13L, 25L, 28L, 6L, 3L,
16L, 12L, 9L, 7L, 20L, 21L, 14L, 5L, 4L, 26L, 8L, 11L, 18L, 2L,
27L, 31L, 30L, 24L, 19L, 1L, 23L, 22L, 10L, 15L, 29L, 17L, 13L,
25L, 28L, 6L, 3L, 16L, 12L, 9L, 7L, 20L, 21L, 14L, 5L, 4L, 26L,
8L, 11L, 18L, 2L, 27L, 31L, 30L, 24L, 19L, 1L, 23L, 22L, 10L,
15L, 29L, 17L), .Label = c("ACHIEVEMENT", "Apprec", "Bal", "Belong",
"BrandPass", "Burn", "Care", "Comms", "Comp", "CONNECTION", "Consul",
"Fam", "FinPerf", "Forw", "FRANCHISE PARTNERSHIP", "Ful", "GENSAT",
"Harm", "Innov", "Integ", "LeadCom", "LEADERSHIP", "LIFESTYLE",
"MarkSup", "Opt", "Part", "PracSup", "SatExp", "SUPPORT", "Syst",
"Train"), class = "factor"), MM = c(22.8758169934641, 29.4117647058824,
7.84313725490196, -29.4117647058824, 11.7647058823529, -3.26797385620914,
7.84313725490196, 11.1111111111111, 15.6862745098039, 9.80392156862744,
9.80392156862744, 5.88235294117648, 33.3333333333333, 17.6470588235294,
13.7254901960784, 19.6078431372549, 11.7647058823529, 1.96078431372548,
13.7254901960784, 21.5686274509804, 15.6862745098039, 11.1111111111111,
21.5686274509804, 1.96078431372548, 20.4481792717087, -3.26797385620914,
10.1307189542484, 20.0980392156863, 10.9803921568627, 14.7058823529412,
11.7647058823529, 22.2222222222222, 16.6666666666667, 8.33333333333333,
-25, 0, -11.1111111111111, 0, -5.55555555555557, 16.6666666666667,
58.3333333333333, 16.6666666666667, 0, 25, 11.1111111111111,
0, 33.3333333333333, 8.33333333333333, -11.1111111111111, 11.1111111111111,
50, 0, 11.1111111111111, 16.6666666666667, 25, 16.6666666666667,
-9.25925925925926, 13.8888888888889, 10.4166666666667, 8.33333333333333,
19.4444444444444, 10.3448275862069, 100, 66.6666666666667, 66.6666666666667,
-33.3333333333333, 100, 55.5555555555556, 100, 55.5555555555556,
100, 33.3333333333333, 66.6666666666667, -11.1111111111111, 100,
77.7777777777778, 33.3333333333333, 66.6666666666667, 66.6666666666667,
11.1111111111111, 77.7777777777778, 66.6666666666667, 33.3333333333333,
33.3333333333333, 100, 33.3333333333333, 80.952380952381, 55.5555555555556,
44.4444444444444, 66.6666666666667, 53.3333333333333, 55.5555555555556,
57.4712643678161, 7.40740740740741, 33.3333333333333, -5.55555555555557,
-5.55555555555557, -5.55555555555557, -14.8148148148148, -22.2222222222222,
-3.70370370370369, -22.2222222222222, -27.7777777777778, -16.6666666666667,
7.40740740740741, 22.2222222222222, 7.40740740740741, 3.70370370370369,
5.55555555555557, 5.55555555555557, 14.8148148148148, -3.70370370370369,
5.55555555555557, 11.1111111111111, 0, 18.5185185185185, -11.1111111111111,
11.1111111111111, -12.3456790123457, -10.1851851851852, 9.72222222222223,
5.55555555555557, 5.55555555555557, 0.766283524904221, 25.9259259259259,
27.7777777777778, 11.1111111111111, -55.5555555555556, 22.2222222222222,
3.70370370370369, 27.7777777777778, 29.6296296296296, 38.8888888888889,
11.1111111111111, 22.2222222222222, 11.1111111111111, 38.8888888888889,
22.2222222222222, 29.6296296296296, 16.6666666666667, 11.1111111111111,
-3.70370370370369, 22.2222222222222, 11.1111111111111, 27.7777777777778,
18.5185185185185, 14.8148148148148, -5.55555555555557, 22.2222222222222,
0, 22.2222222222222, 29.1666666666667, 11.1111111111111, 13.8888888888889,
16.0919540229885, -6.17283950617284, 11.1111111111111, -18.5185185185185,
14.8148148148148, -18.5185185185185, -28.3950617283951, -44.4444444444444,
-16.0493827160494, -29.6296296296296, -25.9259259259259, -33.3333333333333,
3.70370370370369, -3.70370370370369, -18.5185185185185, -18.5185185185185,
-7.40740740740741, -14.8148148148148, -6.17283950617284, -28.3950617283951,
3.70370370370369, -3.70370370370369, -16.0493827160494, -11.1111111111111,
-7.40740740740741, -4.76190476190478, -20.1646090534979, -17.9012345679012,
-14.8148148148148, -14.8148148148148, -8.02469135802468, -13.6653895274585,
42.2222222222222, 53.3333333333333, 33.3333333333333, -86.6666666666667,
26.6666666666667, 2.22222222222221, 46.6666666666667, 20, 60,
46.6666666666667, 53.3333333333333, 6.66666666666667, 66.6666666666667,
51.1111111111111, 42.2222222222222, 46.6666666666667, 33.3333333333333,
11.1111111111111, 55.5555555555556, 33.3333333333333, 33.3333333333333,
33.3333333333333, 55.5555555555556, 13.3333333333333, 42.8571428571429,
-2.22222222222221, 33.3333333333333, 51.6666666666667, 36, 35.5555555555555,
32.4137931034483, 77.7777777777778, 44.4444444444444, 44.4444444444444,
-66.6666666666667, 77.7777777777778, 62.962962962963, 100, 77.7777777777778,
77.7777777777778, 55.5555555555556, 66.6666666666667, 11.1111111111111,
88.8888888888889, 70.3703703703704, 62.962962962963, 55.5555555555556,
55.5555555555556, 11.1111111111111, 70.3703703703704, 55.5555555555556,
44.4444444444444, 55.5555555555556, 62.962962962963, 11.1111111111111,
58.7301587301587, 45.679012345679, 55.5555555555556, 72.2222222222222,
46.6666666666667, 48.1481481481482, 53.639846743295), Split = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L), .Label = c("Success4u", "NSW/ACT",
"QLD", "SA/WA", "VIC/TAS", "Type 1", "Type 2", "Type 3", "Franchise Sector Benchmark"
), class = "factor")), .Names = c("Area.short", "MM", "Split"
), row.names = c(NA, 248L), class = "data.frame")
Here's a way to get the percentile on each benchmark for one subject. We use the empirical cumulative distribution function to calculate percentiles:
# Fake data: one subject's scores on 31 competencies (rows X1..X31) and a
# benchmark of 100 individuals per competency (columns X1..X31).
set.seed(595)
dat <- data.frame(scores = sample(100, 31, replace = TRUE),
                  row.names = paste0("X", 1:31))
bmark <- data.frame(replicate(31, sample(100, 100, replace = TRUE)))
# Get percentile on each benchmark for one subject.
# mapply() walks the benchmark columns and the subject's scores in parallel;
# ecdf(ref)(subj) is the share of benchmark values <= the subject's score.
dat$percentile <- mapply(function(ref, subj) {
  ecdf(ref)(subj) * 100
}, ref = bmark, subj = dat$scores)
dat
scores percentile
X1 28 25
X2 25 30
X3 91 92
...
X29 42 46
X30 76 71
X31 1 2
Here's a boxplot of the distribution of bmark for each competency, along with a red dot showing where the subject scored on each of them:
# One box per competency in the benchmark, with a red dot marking the
# subject's score (from `dat`, the data frame built above) on each of them.
boxplot(bmark)
points(seq_along(dat$scores), dat$scores, pch = 16, col = "red")
If you have multiple subjects, you can get their percentiles all at once. We take the mapply code from above, which calculates percentiles for a single subject, and wrap it in sapply, which feeds the mapply code each subject in succession and returns all of the results in a single matrix:
# Scores on 31 benchmarks for 20 subjects. Each column is a subject.
set.seed(58)
subjects <- as.data.frame(replicate(20, sample(100, 31, replace = TRUE)))
# Get percentile on each benchmark for each subject.
# The outer sapply walks over subjects (columns); the inner mapply pairs each
# benchmark column of `bmark` with that subject's score on the same benchmark.
# The result is a matrix with one row per benchmark and one column per subject.
percentile.score <- sapply(subjects, function(s) {
  mapply(function(ref, subj) {
    ecdf(ref)(subj) * 100
  }, ref = bmark, subj = s)
})
percentile.score
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
X1 28 95 2 24 95 18 61 6 44 11 82 50 28 37 59 41 91 51 35 76
X2 20 49 62 3 24 24 54 23 50 50 60 95 25 63 57 78 72 89 79 66
X3 69 37 40 49 59 30 91 5 92 43 56 22 34 33 28 54 35 1 59 29
...
X29 34 42 10 76 40 48 13 36 76 13 88 91 99 31 13 76 93 42 31 91
X30 16 66 86 56 21 67 86 45 81 16 70 66 24 11 23 18 32 53 31 32
X31 81 81 52 2 3 32 64 36 33 39 92 100 80 2 44 63 59 2 34 99

"minimum count is not zero" error for zero inflated model

Here is the data of my regression :
y is the number of passengers at the platform of the train station in each 2-minute period, while A1 to A17 are the numbers of passengers at 17 study areas on the concourse. The time lag has already been considered by shifting the Xs.
Since sometimes there is no one waiting in the study areas on the concourse, excess zeros occur. I am planning to use a zero-inflated model. I have tried the code shown below, but it says "minimum count is not zero". What does that mean and how can I solve it? I have fitted a Poisson model and it works fine, but the zero-inflated model doesn't work.
> setwd('C:/Users/zuzymelody/Desktop')
> try<-read.csv('0inflated_2mins27peak.csv',header=TRUE)
> attach(try)
> names(try)
[1] "y" "A1" "A2" "A3" "A4" "A5" "A6" "A7" "A8" "A9" "A10" "A11"
[13] "A12" "A13" "A14" "A15" "A16" "A17"
> model1<-glm(y~A1+A2+A3+A4+A5+A6+A7+A8+A9+A10+A11+A12+A13+A14+A15+A16+A17,family="poisson")
> summary(model1)
Call:
glm(formula = y ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 +
A10 + A11 + A12 + A13 + A14 + A15 + A16 + A17, family = "poisson")
Deviance Residuals:
Min 1Q Median 3Q Max
-7.8598 -3.4571 -0.3663 2.1867 12.5183
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 6.102009 0.164497 37.095 < 2e-16 ***
A1 -0.017555 0.003665 -4.790 1.66e-06 ***
A2 -0.026101 0.017569 -1.486 0.137371
A3 -0.179988 0.014976 -12.018 < 2e-16 ***
A4 -0.032584 0.007735 -4.213 2.52e-05 ***
A5 -0.019908 0.007014 -2.839 0.004532 **
A6 -0.044144 0.010266 -4.300 1.71e-05 ***
A7 0.049829 0.006518 7.645 2.09e-14 ***
A8 -0.080712 0.009819 -8.220 < 2e-16 ***
A9 0.007390 0.007105 1.040 0.298273
A10 0.041116 0.004085 10.065 < 2e-16 ***
A11 -0.041420 0.008418 -4.921 8.62e-07 ***
A12 -0.008241 0.007304 -1.128 0.259171
A13 -0.033161 0.008966 -3.699 0.000217 ***
A14 0.020818 0.005250 3.965 7.34e-05 ***
A15 -0.002995 0.006125 -0.489 0.624887
A16 -0.061997 0.017122 -3.621 0.000294 ***
A17 -0.025025 0.008391 -2.982 0.002860 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 1137.71 on 29 degrees of freedom
Residual deviance: 599.74 on 12 degrees of freedom
AIC: 840.1
Number of Fisher Scoring iterations: 5
>with(model1, cbind(res.deviance = deviance, df = df.residual,
p = pchisq(deviance, df.residual, lower.tail=FALSE)))
res.deviance df p
[1,] 599.7445 12 1.202013e-120
> require( pscl )
> Zip<-zeroinfl(model1,link="logit",dist="poisson")
**Error in zeroinfl(model1, link = "logit", dist = "poisson") :
invalid dependent variable, minimum count is not zero**
dput(try)
structure(list(y = c(156L, 74L, 221L, 207L, 168L, 36L, 128L,
208L, 99L, 117L, 228L, 211L, 341L, 173L, 196L, 310L, 112L, 203L,
104L, 183L, 325L, 143L, 218L, 166L, 218L, 127L, 136L, 38L, 102L,
34L), A1 = c(24L, 24L, 24L, 19L, 20L, 9L, 14L, 23L, 15L, 23L,
14L, 16L, 15L, 25L, 25L, 19L, 24L, 26L, 25L, 26L, 22L, 14L, 13L,
15L, 9L, 12L, 9L, 12L, 15L, 18L), A2 = c(2L, 4L, 0L, 3L, 0L,
1L, 1L, 2L, 1L, 2L, 0L, 2L, 2L, 0L, 1L, 1L, 3L, 3L, 2L, 2L, 3L,
2L, 3L, 5L, 4L, 3L, 4L, 1L, 2L, 1L), A3 = c(2L, 2L, 0L, 1L, 1L,
9L, 3L, 0L, 0L, 0L, 1L, 1L, 3L, 1L, 0L, 0L, 1L, 2L, 3L, 1L, 0L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 2L), A4 = c(15L, 11L, 6L, 7L,
10L, 10L, 5L, 4L, 5L, 7L, 9L, 9L, 4L, 6L, 6L, 13L, 9L, 13L, 9L,
10L, 6L, 6L, 7L, 6L, 10L, 9L, 10L, 7L, 9L, 2L), A5 = c(13L, 10L,
6L, 6L, 11L, 19L, 13L, 14L, 7L, 7L, 6L, 8L, 10L, 5L, 7L, 9L,
9L, 11L, 3L, 13L, 8L, 8L, 8L, 6L, 8L, 9L, 9L, 14L, 9L, 6L), A6 = c(9L,
10L, 9L, 9L, 4L, 7L, 7L, 12L, 11L, 11L, 12L, 8L, 6L, 7L, 8L,
5L, 9L, 6L, 5L, 6L, 9L, 11L, 6L, 6L, 8L, 9L, 4L, 11L, 10L, 7L
), A7 = c(21L, 16L, 13L, 13L, 4L, 9L, 12L, 13L, 12L, 12L, 12L,
6L, 7L, 6L, 6L, 4L, 5L, 9L, 8L, 7L, 9L, 12L, 10L, 7L, 8L, 12L,
14L, 2L, 6L, 6L), A8 = c(1L, 5L, 10L, 10L, 1L, 9L, 6L, 6L, 7L,
7L, 5L, 6L, 3L, 2L, 4L, 0L, 4L, 2L, 5L, 5L, 5L, 3L, 2L, 4L, 3L,
8L, 10L, 8L, 2L, 5L), A9 = c(8L, 9L, 10L, 10L, 12L, 19L, 10L,
6L, 6L, 6L, 0L, 6L, 8L, 10L, 2L, 3L, 6L, 2L, 2L, 6L, 5L, 2L,
4L, 1L, 3L, 7L, 7L, 4L, 4L, 2L), A10 = c(7L, 10L, 12L, 20L, 24L,
21L, 24L, 18L, 20L, 18L, 26L, 21L, 12L, 11L, 18L, 18L, 19L, 16L,
25L, 21L, 22L, 14L, 12L, 17L, 21L, 14L, 14L, 10L, 8L, 7L), A11 = c(0L,
2L, 1L, 4L, 2L, 1L, 1L, 1L, 13L, 10L, 12L, 5L, 2L, 0L, 5L, 1L,
4L, 4L, 3L, 3L, 1L, 1L, 3L, 3L, 5L, 5L, 2L, 10L, 3L, 4L), A12 = c(12L,
14L, 14L, 17L, 10L, 14L, 13L, 19L, 7L, 5L, 6L, 6L, 8L, 7L, 13L,
11L, 10L, 8L, 6L, 6L, 9L, 14L, 9L, 10L, 8L, 9L, 8L, 9L, 5L, 7L
), A13 = c(6L, 2L, 1L, 5L, 9L, 6L, 7L, 4L, 12L, 5L, 9L, 10L,
3L, 7L, 4L, 2L, 2L, 6L, 4L, 6L, 7L, 4L, 9L, 6L, 11L, 4L, 5L,
4L, 6L, 6L), A14 = c(14L, 13L, 16L, 11L, 8L, 6L, 9L, 13L, 14L,
14L, 9L, 8L, 12L, 11L, 13L, 11L, 18L, 15L, 20L, 21L, 17L, 18L,
18L, 18L, 25L, 20L, 12L, 9L, 8L, 8L), A15 = c(7L, 6L, 7L, 5L,
4L, 9L, 12L, 12L, 11L, 12L, 9L, 8L, 7L, 8L, 10L, 16L, 8L, 8L,
13L, 10L, 5L, 5L, 8L, 10L, 10L, 4L, 6L, 6L, 6L, 7L), A16 = c(2L,
1L, 3L, 3L, 1L, 2L, 3L, 2L, 3L, 2L, 2L, 1L, 2L, 2L, 3L, 3L, 2L,
1L, 3L, 4L, 2L, 5L, 4L, 8L, 5L, 2L, 1L, 2L, 2L, 2L), A17 = c(10L,
13L, 13L, 2L, 5L, 1L, 3L, 3L, 5L, 4L, 4L, 6L, 4L, 6L, 3L, 2L,
2L, 2L, 7L, 8L, 3L, 7L, 5L, 6L, 7L, 6L, 6L, 3L, 4L, 3L)), .Names = c("y",
"A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10",
"A11", "A12", "A13", "A14", "A15", "A16", "A17"), class = "data.frame", row.names = c(NA,
-30L))
Above is the reproducible example. Sorry, it's my first time posting here and I don't know the rules well.
Your data frame does not contain a zero value in your dependent variable $y$:
min(mydata$y)
[1] 34
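zeroinfl() models excess zeros in the response, so it requires at least one observation with $y = 0$. Zeros in the predictors (A1 to A17) do not count; with a minimum of 34 in $y$ there are no zeros to inflate, and the ordinary Poisson (or a negative binomial) model is the appropriate choice.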

How to compute standard errors for predicted data

I am trying to generate standard errors for predicted values. I use the below code to generate the predicted values but it fails to also give the standard errors.
# Shift the ordinal predictor ord1 down by 2. Note that ord6 is created as a
# free-standing vector here, not as a column of veg.
ord6 <- veg$ord1-2
# Linear mixed model for log-transformed lai with an ord6 x growth-form
# interaction and random intercepts for plot.code and species.code.
# A small constant is added before taking the log; presumably this is because
# lai contains exact zeros (log(0) is -Inf) -- confirm with the author.
# REML=FALSE fits by maximum likelihood instead of REML.
laimod.group = lmer(log(lai+0.000019) ~ ord6*plant_growth_form +
(1|plot.code) +
(1|species.code),
data=veg,
REML=FALSE)
summary(laimod.group)
# Values of the two fixed-effect predictors to predict at.
new.ord6 <- c(-1,0,1,2,3,4,5,6,7)
new.plant_growth_form <- c("fern", "grass", "herb","herbaceous climber",
"herbaceous shrub", "moss", "tree sapling",
"undet", "woody climber", "woody shrub")
# Prediction grid: every combination of ord6 value and growth form.
newdat <- expand.grid(
ord6=new.ord6,plant_growth_form=new.plant_growth_form)
# re.form=NA asks for predictions from the fixed effects only (random effects
# ignored). NOTE(review): se.fit=TRUE is requested, but the question reports
# that no standard errors are returned -- confirm whether the installed lme4
# version's predict method supports se.fit.
newdat$pred <- predict(laimod.group,newdat, se.fit=TRUE, re.form=NA)
newdat
comment 1: laimod.group = final model selected after comparison of five models using lmer (package lme4)
comment 2: predictSE.mer requires package AICcmodavg
I did try the code below as an alternative, but I continue to receive the following error message: Error in fam.link.mer(mod) : object 'out.link' not found
newdat$pred <- predictSE.mer(laimod.group, newdat, se.fit = TRUE, type = "response",
level = 0, print.matrix = FALSE)
Please see a reproducible subset of my data:
structure(list(plot.code = structure(c(1L, 2L, 3L, 4L, 5L, 5L,
5L, 5L, 5L, 5L, 6L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L,
9L, 9L, 10L, 11L, 11L, 11L, 11L, 11L, 12L, 13L, 14L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 16L, 17L, 18L, 19L, 19L, 19L, 20L, 21L,
22L, 23L, 24L, 25L, 26L, 27L, 27L, 28L, 28L, 28L, 28L, 28L, 29L,
29L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 31L, 32L, 34L, 34L, 34L,
34L, 34L, 34L, 34L, 35L, 36L, 36L, 36L, 37L, 38L, 39L, 39L, 39L,
40L, 40L, 33L, 33L), .Label = c("a100f1r", "a100m562r", "a10m562r",
"a1f56r", "a1m5r", "b100f177r", "b100m17r", "b100m5r", "c100f17r",
"c100f1r", "c100f5r", "d100m56r", "d100m5r", "d10f1r", "d10f5r",
"e100m17r(old)", "e100m1r", "e100m5r", "e10f177r", "e10f17r(old)",
"e10f5r(old)", "e1f17r", "e1f5r", "f100m177r", "f10f177r", "f10f17r",
"f1m177r", "f1m56r", "lf1f1r", "lf1f5r", "lf1m1r", "og100f5r",
"og10f1r", "og10m1r", "og10m5r", "op100f562r", "op100m177r",
"op10f1r", "op10f5r", "op10m562r"), class = "factor"), species.code = structure(c(69L,
59L, 67L, 69L, 20L, 44L, 28L, 32L, 31L, 7L, 13L, 63L, 69L, 52L,
69L, 14L, 54L, 57L, 42L, 9L, 62L, 10L, 22L, 69L, 35L, 49L, 38L,
11L, 41L, 39L, 16L, 40L, 69L, 32L, 33L, 41L, 22L, 69L, 43L, 4L,
68L, 48L, 6L, 34L, 53L, 3L, 15L, 30L, 13L, 31L, 66L, 64L, 38L,
46L, 61L, 29L, 61L, 27L, 8L, 41L, 55L, 58L, 23L, 25L, 18L, 45L,
26L, 13L, 65L, 12L, 51L, 50L, 60L, 47L, 17L, 5L, 19L, 61L, 1L,
37L, 13L, 36L, 13L, 2L, 11L, 24L, 44L, 13L, 49L, 56L, 21L), .Label = c("agetri",
"alb214", "annunk", "arimin", "baudip", "beg032", "blurip", "buc009",
"cal079", "calplu", "chrodo", "cishas", "clihir", "cos049", "cycari",
"cypunk", "cyr075", "cyrped1", "dae205", "dalpin", "diapla1",
"dio063", "diosum", "emison", "ery046", "eryborb", "fic119",
"ficmeg", "friacu", "graunk", "indunk", "jactom", "lauunk", "leeind",
"luvsar", "lyccer", "mac068", "melmal", "mergra", "miccra1",
"mikcor", "mitken", "nep127", "nepbis", "paldas", "palunk", "panunk",
"penlax", "poaunk", "pol019", "pop246", "ptecog", "ptesub1",
"rubcle", "ryphul", "scamac", "scl051", "sclsum", "selcup", "selfro",
"spa098", "sphste1", "stitrut", "tet055", "tetdie", "tetdie1",
"tetkor", "xanfla", "zinunk"), class = "factor"), plant_growth_form = structure(c(3L,
6L, 9L, 3L, 7L, 1L, 7L, 4L, 8L, 5L, 5L, 1L, 3L, 7L, 3L, 3L, 9L,
2L, 9L, 7L, 9L, 7L, 7L, 3L, 9L, 2L, 10L, 4L, 4L, 9L, 2L, 7L,
3L, 4L, 7L, 4L, 7L, 3L, 1L, 4L, 7L, 7L, 3L, 10L, 7L, 7L, 1L,
2L, 5L, 8L, 9L, 9L, 10L, 7L, 9L, 9L, 9L, 7L, 7L, 4L, 7L, 2L,
7L, 10L, 3L, 7L, 10L, 5L, 9L, 9L, 7L, 7L, 6L, 7L, 3L, 9L, 9L,
9L, 9L, 7L, 5L, 6L, 5L, 9L, 4L, 3L, 1L, 5L, 2L, 7L, 7L), .Label = c("fern",
"grass", "herb", "herbaceous climber", "herbaceous shrub", "moss",
"tree sapling", "undet", "woody climber", "woody shrub"), class = "factor"),
ord1 = c(9L, 5L, 7L, 9L, 4L, 4L, 5L, 5L, 5L, 2L, 9L, 5L,
4L, 6L, 8L, 6L, 3L, 3L, 5L, 3L, 4L, 5L, 3L, 5L, 3L, 9L, 6L,
4L, 4L, 6L, 2L, 5L, 5L, 9L, 3L, 4L, 3L, 5L, 3L, 4L, 1L, 8L,
1L, 5L, 7L, 6L, 9L, 1L, 9L, 1L, 4L, 4L, 2L, 5L, 2L, 3L, 5L,
1L, 3L, 3L, 3L, 2L, 6L, 5L, 2L, 6L, 5L, 2L, 5L, 3L, 6L, 5L,
6L, 3L, 3L, 4L, 7L, 4L, 6L, 1L, 2L, 2L, 4L, 3L, 3L, 3L, 3L,
4L, 4L, 3L, 3L), lai = c(4.525068022, 0.325399379, 0.229222148,
4.076350538, 0.006889889, 0.003279268, 0.037268428, 0.056032134,
0.013573973, 0.001304667, 0.696949844, 1.256477431, 0.122569437,
0.191398415, 1.606070777, 0.425381508, 0.03013251, 0.00181661,
0.017317993, 0.014455456, 0.102704752, 0.031065374, 0.000923601,
0.453384679, 0.017859983, 7.765697214, 0.127071322, 0.102178413,
0.049099766, 0.427983019, 4.22e-05, 0.229034333, 0.694745347,
0.068069112, 0.218354525, 0.05883256, 0.032252145, 0.304812298,
0.009320025, 0.036424481, 0, 0.326, 0.000201724, 0.286106787,
0.556249444, 0.274764132, 4.21, 0, 0.695663959, 0.000213763,
0.00476907, 0.000205017, 3.77e-05, 0.134661951, 0.005631489,
0.0971, 0.172154618, 5.91e-05, 0.000371101, 0.000145266,
0.013382779, 0.00025348, 0.11016712, 0.0616302, 0.018011524,
0.107619537, 0.189926726, 0.000857257, 0.041252452, 0, 0.00475341,
0.077329281, 0.633865958, 0.038182437, 0.015560589, 0.010375148,
1.515423445, 0.008559863, 0.003636564, 0.000424537, 0.002786085,
0.091458876, 0.014216177, 0.165042816, 0.009187705, 0.00115711,
0.000920496, 0.009072635, 0.001443384, 0.001595447, 0.023263507
)), .Names = c("plot.code", "species.code", "plant_growth_form",
"ord1", "lai"), class = "data.frame", row.names = c(NA, -91L))

Ggplot2 geom_line error

I have a dataset which consists of data points over a time series for the proportion of people living in urban/rural areas for a number of countries. Sadly, not all countries have data for the same years. I have been trying to produce a simple line plot to show the different proportions of people living in different locations by year, but as each country has a different number of data points I am running into trouble.
I think this is because some of the countries only have data for a single year and using geom_line from ggplot2 throws the following error:
geom_path: Each group consist of only one observation. Do you need to
adjust the group aesthetic?
I was hoping that there would be some way to override this, or perhaps just plot a single point where a COUNTRY only has data for a single year. Does anyone know if this is possible, or indeed, if this is actually what this error means?!!?
Any help greatly appreciated!!!
Thanks
Here is my data:
structure(list(COUNTRY = structure(c(1L, 2L, 2L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L,
8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 10L, 11L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L, 1L,
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 6L, 6L, 6L, 6L, 7L,
7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 10L, 11L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 14L,
14L, 14L, 14L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L,
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L,
9L, 9L, 10L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L,
13L, 13L, 13L, 14L, 14L, 14L, 14L, 1L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 8L,
8L, 8L, 9L, 9L, 9L, 9L, 9L, 10L, 11L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L, 1L, 2L,
2L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L,
7L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 10L, 11L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 14L, 14L,
14L, 14L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 6L,
6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L, 9L,
9L, 10L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L,
13L, 13L, 14L, 14L, 14L, 14L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L,
4L, 4L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L,
8L, 9L, 9L, 9L, 9L, 9L, 10L, 11L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L), class = "factor", .Label = c("Comoros",
"Eritrea", "Ethiopia", "Kenya", "Lesotho", "Madagascar", "Malawi",
"Namibia", "Rwanda", "South Africa", "Swaziland", "Tanzania",
"Zambia", "Zimbabwe")), Year = structure(c(5L, 12L, 4L, 25L,
16L, 9L, 22L, 13L, 7L, 2L, 23L, 15L, 22L, 14L, 6L, 1L, 24L, 15L,
9L, 1L, 13L, 6L, 19L, 9L, 1L, 24L, 21L, 16L, 9L, 1L, 7L, 19L,
24L, 13L, 8L, 5L, 1L, 18L, 10L, 4L, 20L, 11L, 5L, 1L, 24L, 17L,
8L, 3L, 5L, 12L, 4L, 25L, 16L, 9L, 22L, 13L, 7L, 2L, 23L, 15L,
22L, 14L, 6L, 1L, 24L, 15L, 9L, 1L, 13L, 6L, 19L, 9L, 1L, 24L,
21L, 16L, 9L, 1L, 7L, 19L, 24L, 13L, 8L, 5L, 1L, 18L, 10L, 4L,
20L, 11L, 5L, 1L, 24L, 17L, 8L, 3L, 5L, 12L, 4L, 25L, 16L, 9L,
22L, 13L, 7L, 2L, 23L, 15L, 22L, 14L, 6L, 1L, 24L, 15L, 9L, 1L,
13L, 6L, 19L, 9L, 1L, 24L, 21L, 16L, 9L, 1L, 7L, 19L, 24L, 13L,
8L, 5L, 1L, 18L, 10L, 4L, 20L, 11L, 5L, 1L, 24L, 17L, 8L, 3L,
5L, 12L, 4L, 25L, 16L, 9L, 22L, 13L, 7L, 2L, 23L, 15L, 22L, 14L,
6L, 1L, 24L, 15L, 9L, 1L, 13L, 6L, 19L, 9L, 1L, 24L, 21L, 16L,
9L, 1L, 7L, 19L, 24L, 13L, 8L, 5L, 1L, 18L, 10L, 4L, 20L, 11L,
5L, 1L, 24L, 17L, 8L, 3L, 5L, 12L, 4L, 25L, 16L, 9L, 22L, 13L,
7L, 2L, 23L, 15L, 22L, 14L, 6L, 1L, 24L, 15L, 9L, 1L, 13L, 6L,
19L, 9L, 1L, 24L, 21L, 16L, 9L, 1L, 7L, 19L, 24L, 13L, 8L, 5L,
1L, 18L, 10L, 4L, 20L, 11L, 5L, 1L, 24L, 17L, 8L, 3L, 5L, 12L,
4L, 25L, 16L, 9L, 22L, 13L, 7L, 2L, 23L, 15L, 22L, 14L, 6L, 1L,
24L, 15L, 9L, 1L, 13L, 6L, 19L, 9L, 1L, 24L, 21L, 16L, 9L, 1L,
7L, 19L, 24L, 13L, 8L, 5L, 1L, 18L, 10L, 4L, 20L, 11L, 5L, 1L,
24L, 17L, 8L, 3L, 5L, 12L, 4L, 25L, 16L, 9L, 22L, 13L, 7L, 2L,
23L, 15L, 22L, 14L, 6L, 1L, 24L, 15L, 9L, 1L, 13L, 6L, 19L, 9L,
1L, 24L, 21L, 16L, 9L, 1L, 7L, 19L, 24L, 13L, 8L, 5L, 1L, 18L,
10L, 4L, 20L, 11L, 5L, 1L, 24L, 17L, 8L, 3L), class = "factor", .Label = c("1992",
"1993", "1994", "1995", "1996", "1997", "1998", "1999", "2000",
"2000/1", "2001/2", "2002", "2003", "2003/4", "2004", "2005",
"2005/6", "2006", "2006/7", "2007", "2007/8", "2008/9", "2009",
"2010", "2011")), location = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L), .Label = c("Urban",
"Rural", "Total", "Capital.City", "Other.Cities.towns", "Urban.Non.slum",
"Urban.Slum"), class = "factor"), percent = c(63.0434782608696,
93.8, 87, 79.5642604795185, 65.4240807416892, 63.0791092522326,
90.448386469558, 85.9419999774024, 92.7603614781794, 84.0437368780105,
89.9792286718626, 91.0916571421351, 87.1132950026762, 73.8624315865239,
60.8311005575454, 66.7, 96, 86.8, 90.6243926153181, 90.6911141749493,
90.7602286016099, 93.0377175475414, 86.073106379954, 84.253722056373,
77.8178199148702, 97.3, 91.8332260789258, 89.612164524266, 89.9070989918367,
94.9, 85.1351949905457, 94.8358752154967, 92.9, 89.656599879838,
90.2634019334124, 94.4, 91.6241263241579, 76.7337303943862, 68.4233513070184,
74.15601627144, 88.4802888646634, 85.4643913454376, 89.7457528950664,
81.3025210084024, 83.0579155525397, 71.5857386620092, 86.2324062094295,
87.687478493975, 63.5379061371841, 78.5, 40.7, 51.7763728811622,
32.2441768813334, 22.3138981723172, 83.3699691175754, 69.6742912391579,
76.0526239692028, 83.7290062290807, 77.4758329101792, 83.8081963934296,
67.5805226154664, 55.8951299980461, 41.9921451192584, 52.2, 92.5,
77.6, 82.0322170392223, 85.2850090044269, 70.8031150919282, 47.108593681531,
82.2215412952297, 78.3643348536815, 74.4253468485616, 94.8, 90.1711142192198,
85.0338348718722, 86.3134329333052, 90.4, 79.2813256726705, 90.7077549957666,
82.5, 77.7236217339155, 75.3278238729086, 77.7, 78.4592126267142,
67.1145693585691, 55.3459024734839, 57.8463881286199, 83.5604620304044,
83.9259722574938, 84.4589780509803, 73.3992444632325, 77.544833952707,
63.0503715222555, 75.6808008503601, 85.6943513045284, 63.4, 84.2,
51, 55.7151220012609, 34.9, 26.6, 85, 72.5, 79.2, 83.8, 80.3,
84.9, 69.6, 59, 46, 54, 93, 78.7, 83.2, 85.9, 76.7, 57.5, 83.8,
80.4, 75.6, 95, 90.4, 85.6, 86.9, 90.6, 82.2, 91.5, 84.5, 79.9,
78.1, 80.9, 81.2, 68.1, 56.8, 59.6, 84.9, 84.4, 86.5, 77, 79.1337842548663,
65.6, 79.1, 86.3, 68.421052631579, 96.1, 93.3, 93.461209969107,
82.2712525836501, 88.2708936990495, 87.6298001816506, 87.6386027991385,
93.1818181818183, 86.6666666666668, 88.1030398041979, 90.4761904761904,
83.4297434324662, 86.3744073211853, 83.6107223166148, 78.3, NA,
72.8, 80.952380952381, 87.5, 96.9073193030442, 99.1348508752745,
85.5297651573129, 86.4793919321843, 79.4520547945208, 98.2, 92.4613307718678,
85.4590408924955, 83.9378238341966, 92.1, 81.1594202898552, 96.0232554251852,
NA, 88.0377726639494, 83.690767555447, 93.4, 90.0349966633017,
71.2508707571865, 72, 79.4082828804656, 91.8032786885246, 84.5238095238095,
87.8787878787881, 75.6097560975609, 81.0643061692494, 68.4708412135189,
84.9056603773584, 89.5522388059702, 61.6438356164384, 91.7, 79.5,
77.0004220956012, 61.061381883032, 58.756042602018, 91.2594694272412,
85.20149612163, 92.4956062313464, 82.622382662868, 91.4036416540165,
91.6169313256523, 89.2957214499669, 67.6757501795213, 48.1479760952102,
NA, NA, 94.2, 94.3553068539161, 91.8799748693178, 89.3739230258784,
92.1418739343887, 86.4757947454868, 81.0102236379536, 77.0100025126874,
NA, 91.3720851411616, 92.2, 92.5003150086683, 97.8260869565219,
87.1461797069698, 93.5168077834096, NA, 90.1780793791367, 92.9758067301415,
94.9, 91.8829499602467, 81.749280834314, 65.1853441661798, 69.0503609949116,
87.2562445664681, 85.8298270239758, 90.6673511683335, 83.2861189801694,
84.9006282245266, 73.65452177457, 87.3075692692965, 85.5310215524833,
83.3333333333333, NA, NA, 98.5990187756088, 84.4640706359058,
NA, 93.9158337759274, 91.5744358611439, 100, NA, NA, NA, 88.7824144772468,
85.1972665683085, 89.54493171236, NA, NA, 89.8, NA, 100, 97.6261376125643,
96.3196943955923, 92.0952338262334, 87.9266080431752, 80.9429968520701,
NA, NA, 92.8, 95.2886158200472, 100, 86.4199793410402, NA, NA,
89.9001648604344, NA, NA, 91.5033109800214, 83.8918470610424,
73.9339911532972, 88.6921281548131, 94.309068022859, 85.3299585067346,
93.7362934447331, 86.5384615384618, 83.7424288707868, NA, 86.3836615391687,
88.1866796344726, 58.1081081081081, NA, NA, 75.7976468146464,
62.1289432084197, NA, 88.1488735873722, 84.2108238885019, 89.8335978405451,
NA, NA, NA, 86.9222656846515, 70.3584041024493, 70.9023609260137,
NA, NA, 85.9, NA, 89.8689917369566, 90.3864925686512, 92.628169473785,
80.9468895007753, 78.7885741638367, 75.4005791241575, NA, NA,
88.4, 87.7139456942162, 92.3809523809525, 83.7645232075473, NA,
NA, 89.567507133125, NA, NA, 91.6433898994358, 73.6225283043976,
65.9223049858496, 72.3148320483822, 86.2596215693035, 85.6224026570651,
87.4940330171337, 78.7499999999997, 81.9949404453665, NA, 84.5563115043796,
87.0190820047277)), .Names = c("COUNTRY", "Year", "location",
"percent"), row.names = c(NA, -336L), class = "data.frame")
I want to produce a simple plot with ggplot2 that is facetted by COUNTRY. I can do this fine using geom_point:
ggplot(meas_melt, aes(Year, percent, colour=location))+ geom_point() + facet_wrap(~COUNTRY)
However, if I try and produce a line plot with geom_line (ggplot(meas_melt, aes(Year, percent, colour=location))+ geom_line() + facet_wrap(~COUNTRY))
I get the following error:
geom_path: Each group consist of only one observation. Do you need to
adjust the group aesthetic?
I had thought that this could be because a couple of the countries have only one year's worth of data, so I subsetted the data to remove these three countries like so:
ggplot(meas_melt, aes(Year, percent, colour=location))+ geom_line(data=meas_melt[!meas_melt$COUNTRY %in% c('Comoros','South Africa','Swaziland'),]) + facet_wrap(~COUNTRY)
However, I get the same error!
@Sven's answer is correct but fixes only part of the problem. Note how there's no plot for Comoros, South Africa, or Swaziland. This is because in your data, sometimes year is, e.g., 2006 or 2007, and sometimes it is "2006/7".
# Show every row for Swaziland. The data frame is meas_melt; `data` is a base
# R function and cannot be subset with `[`.
meas_melt[meas_melt$COUNTRY == "Swaziland", ]
COUNTRY Year location percent
32 Swaziland 2006/7 Urban 94.83588
80 Swaziland 2006/7 Rural 90.70775
128 Swaziland 2006/7 Total 91.50000
176 Swaziland 2006/7 Capital.City 96.02326
224 Swaziland 2006/7 Other.Cities.towns 93.51681
272 Swaziland 2006/7 Urban.Non.slum NA
320 Swaziland 2006/7 Urban.Slum NA
Those countries really have only one "year" (hence, no line). More importantly, these odd year designations distort your x-axis. You can see that using the scales="free" argument to facet_wrap(...):
# Line plot of percent by Year, one coloured line per location, one panel per
# COUNTRY.
ggplot(meas_melt, aes(x=Year,y=percent, color=location)) +
# group=location makes each location a single line; scales="free" gives each
# panel its own axes, so only the Year labels present for that country appear.
geom_line(aes(group=location)) +facet_wrap(~COUNTRY, scales="free") +
# Rotate the x-axis labels so they do not overlap; put the legend below.
theme(axis.text.x=element_text(angle=90, vjust=0.5, size=8),
legend.position="bottom")
Which produces this:
You have to specify aes(group = location) inside geom_line:
library(ggplot2)
# Year is a factor in meas_melt, so by default ggplot2 groups by every
# combination of the discrete variables (Year x location). Each such group
# holds a single observation, which is what triggers the geom_path message.
ggplot(meas_melt, aes(Year, percent, colour=location)) +
# Group by location only, so the points of one location are joined across
# years.
geom_line(aes(group = location)) +
# One panel per country.
facet_wrap(~COUNTRY)

Resources