ggplot2 polygon -> Error: Discrete value supplied to continuous scale - r

I am trying to draw a couple of countries and fill them with color based on a certain variable. I've achieved this for one country; however, when I try to plot multiple countries I keep getting Error: Discrete value supplied to continuous scale, although the code is basically the same. Here is the non-working code and its dataset:
library(ggplot2)
library(maptools)
library(rgeos)
library(rgdal)
library(gpclib)
library(dplyr)
# Load the country-level (adm0) shapes. load() returns the name of the object
# it restored and get() fetches that object, so each variable below holds the
# contents of one .RData file.
# NOTE(review): presumably each file contains exactly one object - confirm.
mne <- get(load('/Users/praeconium/Desktop/Dev/NonResponse/GoliOtok/Paper/Shapes/MNE_adm0.RData'))
sr <- get(load('/Users/praeconium/Desktop/Dev/NonResponse/GoliOtok/Paper/Shapes/SRB_adm0.RData'))
sl <- get(load('/Users/praeconium/Desktop/Dev/NonResponse/GoliOtok/Paper/Shapes/SVN_adm0.RData'))
hr <- get(load('/Users/praeconium/Desktop/Dev/NonResponse/GoliOtok/Paper/Shapes/HRV_adm0.RData'))
bh <- get(load('/Users/praeconium/Desktop/Dev/NonResponse/GoliOtok/Paper/Shapes/BIH_adm0.RData'))
mk <- get(load('/Users/praeconium/Desktop/Dev/NonResponse/GoliOtok/Paper/Shapes/MKD_adm0.RData'))
gpclibPermit()
# Flatten every spatial object into a plain data frame for ggplot2, using the
# English country name as the region id.
mne.adm0.df <- fortify(mne, region = "NAME_ENGLISH")
sr.adm0.df <- fortify(sr, region = "NAME_ENGLISH")
sl.adm0.df <- fortify(sl, region = "NAME_ENGLISH")
hr.adm0.df <- fortify(hr, region = "NAME_ENGLISH")
bh.adm0.df <- fortify(bh, region = "NAME_ENGLISH")
mk.adm0.df <- fortify(mk, region = "NAME_ENGLISH")
# Stack the six data frames. `combine` starts out as the scalar 0, so the
# first rbind() apparently contributes a placeholder row, which `[-1,]` drops
# again further down.
# NOTE(review): str(Set) reports `order` and `hole` as num, whereas the working
# script's str(mne2) reports int and logi - presumably a side effect of this
# placeholder row; confirm.
combine <- 0
combine <- rbind(combine, mne.adm0.df)
combine1 <- rbind(combine, sr.adm0.df)
combine2 <- rbind(combine1, sl.adm0.df)
combine3 <- rbind(combine2, hr.adm0.df)
combine4 <- rbind(combine3, bh.adm0.df)
combine5 <- rbind(combine4, mk.adm0.df)
# Discard the intermediate results; only combine5 is used from here on.
rm(combine,combine1,combine2,combine3, combine4)
combine6 <- combine5[-1,]
# One row per country: the value to plot (emp) and a label position.
# NOTE(review): long and lat are written as quoted strings, and str(Set) lists
# long.y / lat.y as Factor, i.e. these coordinates are discrete, not numeric.
States <- data.frame(id=unique(combine6$id), emp=c(1,2,3,1,2,3), long=c("19.09462","19.70397","16.28719","16.40274","16.96187","22.23574"), lat=c("43.52889","46.18837","46.8704","46.43516","45.22893","42.3041"))
# Both inputs carry long and lat columns, hence the .x / .y suffixes in Set.
Set <- merge(combine6, States, by='id')
# NOTE(review): the polygon layer maps x = lat.x and y = long.x, while the text
# layer maps x = long and y = lat - the axes look swapped between the two
# layers; confirm which orientation is intended.
p <- ggplot(Set, aes(x = lat.x, y = long.x, group = group)) +
geom_polygon(aes(fill = cut(emp,10))) +
# NOTE(review): this layer places the Factor columns States$long / States$lat
# on the x and y position scales - the likely source of "Discrete value
# supplied to continuous scale"; verify by making them numeric.
geom_text(data = States, aes(label = id, x = long, y = lat, group = id), size = 3) +
labs(x=" ", y=" ") +
theme_bw() + scale_fill_brewer('Rate', palette = 'PuRd') +
coord_map() +
theme(panel.grid.minor=element_blank(), panel.grid.major=element_blank()) +
theme(axis.ticks = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank()) +
theme(panel.border = element_blank())
print(p)
session output, and dataset
> str(Set)
'data.frame': 220148 obs. of 10 variables:
$ id : chr "Bosnia and Herzegovina" "Bosnia and Herzegovina" "Bosnia and Herzegovina" "Bosnia and Herzegovina" ...
$ long.x: num 16.9 16.9 16.9 16.9 17 ...
$ lat.x : num 45.2 45.2 45.2 45.2 45.2 ...
$ order : num 1 2 3 4 5 6 7 8 9 10 ...
$ hole : num 0 0 0 0 0 0 0 0 0 0 ...
$ piece : Factor w/ 521 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ group : Factor w/ 531 levels "Montenegro.1",..: 530 530 530 530 530 530 530 530 530 530 ...
$ emp : num 2 2 2 2 2 2 2 2 2 2 ...
$ long.y: Factor w/ 6 levels "16.28719","16.40274",..: 3 3 3 3 3 3 3 3 3 3 ...
$ lat.y : Factor w/ 6 levels "42.3041","43.52889",..: 3 3 3 3 3 3 3 3 3 3 ...
> sessionInfo()
R version 3.0.2 (2013-09-25)
Platform: x86_64-apple-darwin10.8.0 (64-bit)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] grid stats graphics grDevices utils datasets methods base
other attached packages:
[1] psych_1.3.2 gridExtra_0.9.1 lubridate_1.3.3 mapproj_1.2-1 maps_2.3-6 dplyr_0.1 gpclib_1.5-5 rgdal_0.8-16
[9] rgeos_0.3-4 maptools_0.8-29 sp_1.0-14 ggplot2_0.9.3.1
loaded via a namespace (and not attached):
[1] assertthat_0.1 colorspace_1.2-2 dichromat_2.0-0 digest_0.6.3 foreign_0.8-55 gtable_0.1.2
[7] labeling_0.2 lattice_0.20-23 MASS_7.3-29 memoise_0.1 munsell_0.4.2 plyr_1.8
[13] proto_0.3-10 RColorBrewer_1.0-5 Rcpp_0.11.0 reshape2_1.2.2 scales_0.2.3 stringr_0.6.2
[19] tools_3.0.2
> headTail(Set)
id long.x lat.x order hole piece group emp long.y lat.y
1 Bosnia and Herzegovina 16.93 45.23 1 0 1 Bosnia and Herzegovina.1 2 16.96187 45.22893
2 Bosnia and Herzegovina 16.94 45.23 2 0 1 Bosnia and Herzegovina.1 2 16.96187 45.22893
3 Bosnia and Herzegovina 16.95 45.23 3 0 1 Bosnia and Herzegovina.1 2 16.96187 45.22893
4 Bosnia and Herzegovina 16.95 45.23 4 0 1 Bosnia and Herzegovina.1 2 16.96187 45.22893
... <NA> ... ... ... ... <NA> <NA> ... <NA> <NA>
220145 Slovenia 13.6 45.49 2791 0 2 Slovenia.2 3 16.28719 46.8704
220146 Slovenia 13.6 45.49 2792 0 2 Slovenia.2 3 16.28719 46.8704
220147 Slovenia 13.6 45.5 2793 0 2 Slovenia.2 3 16.28719 46.8704
220148 Slovenia 13.6 45.5 2794 0 2 Slovenia.2 3 16.28719 46.8704
and here is the working code and its output, basically the same, except the files are on different administrative level
library(ggplot2)
library(maptools)
library(rgeos)
library(rgdal)
library(gpclib)
# Read the level-1 administrative shapes; the .RData file is expected to
# restore an object named "gadm".
load("/Users/praeconium/Downloads/MNE_adm1.RData")
mne.adm2.spdf <- get("gadm")
gpclibPermit()

# One row per polygon vertex, keyed by the NAME_1 attribute.
mne.adm2.df <- fortify(mne.adm2.spdf, region = "NAME_1")

# Attach one value per region.
id <- unique(mne.adm2.df$id)
emp <- c(
  1, 2, 3, 4, 5, 6, 7, 8, 9,
  1, 2, 3, 4, 5, 6, 7, 8, 9,
  1, 2, 3
)
unemp <- data.frame(id = id, emp = emp)
mne2 <- merge(mne.adm2.df, unemp, by = "id")

# Label positions taken from coordinates() of the spatial object, plus the
# short labels to print there.
mne.centro <- data.frame(
  long = coordinates(mne.adm2.spdf)[, 1],
  lat = coordinates(mne.adm2.spdf)[, 2]
)
towns <- c(
  "And", "Bar", "Berane", "BP", "BD", "CT", "DG",
  "HN", "KOl", "KO", "MK", "NK", "PL", "PV",
  "PLu", "PG", "RO", "SA", "TV", "UL", "ZB"
)
mne.centro3 <- data.frame(towns = towns, mne.centro, id = id)

# Plot
p <- ggplot(mne2, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = cut(emp, 5))) +
  geom_text(
    data = mne.centro3,
    aes(x = long, y = lat, label = towns, group = towns),
    size = 3
  ) +
  labs(x = " ", y = " ") +
  theme_bw() +
  scale_fill_brewer("Rate", palette = "PuRd") +
  coord_map() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.ticks = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    panel.border = element_blank()
  )
print(p)
> str(mne2)
data.frame': 12931 obs. of 8 variables:
$ id : chr "Andrijevica" "Andrijevica" "Andrijevica" "Andrijevica" ...
$ long : num 19.9 19.9 19.9 19.8 19.8 ...
$ lat : num 42.7 42.7 42.7 42.7 42.7 ...
$ order: int 1 2 3 4 5 6 7 8 9 10 ...
$ hole : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ piece: Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
$ group: Factor w/ 27 levels "Andrijevica.1",..: 1 1 1 1 1 1 1 1 1 1 ...
$ emp : num 1 1 1 1 1 1 1 1 1 1 ...
> sessionInfo()
R version 3.0.2 (2013-09-25)
Platform: x86_64-apple-darwin10.8.0 (64-bit)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] grid stats graphics grDevices utils
[6] datasets methods base
other attached packages:
[1] psych_1.3.2 gridExtra_0.9.1 lubridate_1.3.3
[4] mapproj_1.2-1 maps_2.3-6 dplyr_0.1
[7] gpclib_1.5-5 rgdal_0.8-16 rgeos_0.3-4
[10] maptools_0.8-29 sp_1.0-14 ggplot2_0.9.3.1
loaded via a namespace (and not attached):
[1] assertthat_0.1 colorspace_1.2-2
[3] dichromat_2.0-0 digest_0.6.3
[5] foreign_0.8-55 gtable_0.1.2
[7] labeling_0.2 lattice_0.20-23
[9] MASS_7.3-29 memoise_0.1
[11] munsell_0.4.2 plyr_1.8
[13] proto_0.3-10 RColorBrewer_1.0-5
[15] Rcpp_0.11.0 reshape2_1.2.2
[17] scales_0.2.3 stringr_0.6.2
[19] tools_3.0.2
> headTail(mne2)
id long lat order hole piece group emp
1 Andrijevica 19.89 42.68 1 FALSE 1 Andrijevica.1 1
2 Andrijevica 19.88 42.67 2 FALSE 1 Andrijevica.1 1
3 Andrijevica 19.85 42.66 3 FALSE 1 Andrijevica.1 1
4 Andrijevica 19.84 42.66 4 FALSE 1 Andrijevica.1 1
... <NA> ... ... ... <NA> <NA> <NA> ...
12928 Žabljak 19.02 43.25 12928 FALSE 1 Žabljak.1 3
12929 Žabljak 19.02 43.26 12929 FALSE 1 Žabljak.1 3
12930 Žabljak 19.03 43.26 12930 FALSE 1 Žabljak.1 3
12931 Žabljak 19.03 43.26 12931 FALSE 1 Žabljak.1 3
I've followed these questions to try to resolve this, but I still don't understand what I am doing wrong:
Administrative regions map of a country with ggmap and ggplot2
I've checked these and these
Specifying the colour scale for maps in ggplot
Add color to boxplot - "Continuous value supplied to discrete scale" error

I partially achieved my goal, but did not solve the problem at hand, by excluding
geom_text(data = mne.centro3, aes(label = towns, x = long, y = lat, group = towns), size = 3) +
but I don't understand how the same code is continuous here and discrete there :S

In my own limited experience, I have found I typically need to factorize my grouping conditions. If your data is being interpreted as continuous you can force it to be discrete by factorizing.
The variables I am most concerned about are mne2$emp and mne2$id, especially the latter as in this case the variable towns will most likely be treated as discrete while mne2$id will most likely be treated as continuous. Something to the effect of group = factor(mne2$id) could solve your problem.

Related

Wrong data type conversion after melt

I get a numeric to integer64 type conversion after melting a data.table object in R.
Given the file stats.txt, tab separated:
id x y
A 283726709252 0.1
B 288604342155 0.2
C 329048184196 0.3
D 192107948937 0.4
I want to read it into a data.table and melt it. So:
library(data.table)
stats<- fread('stats.txt')
stats
id x y
1: A 283726709252 0.1
2: B 288604342155 0.2
3: C 329048184196 0.3
4: D 192107948937 0.4
str(stats)
Classes ‘data.table’ and 'data.frame': 4 obs. of 3 variables:
$ id: chr "A" "B" "C" "D"
$ x :integer64 283726709252 288604342155 329048184196 192107948937
$ y : num 0.1 0.2 0.3 0.4
- attr(*, ".internal.selfref")=<externalptr>
So far so good. Now if I melt it, I get the y variable converted from numeric to integer64:
xm<- melt.data.table(data= stats, id.vars= 'id')
xm
id variable value
1: A x 283726709252
2: B x 288604342155
3: C x 329048184196
4: D x 192107948937
5: A y 4591870180066957722
6: B y 4596373779694328218
7: C y 4599075939470750515
8: D y 4600877379321698714
str(xm)
Classes ‘data.table’ and 'data.frame': 8 obs. of 3 variables:
$ id : chr "A" "B" "C" "D" ...
$ variable: Factor w/ 2 levels "x","y": 1 1 1 1 2 2 2 2
$ value :integer64 283726709252 288604342155 329048184196 192107948937 4591870180066957722 4596373779694328218 4599075939470750515 4600877379321698714
- attr(*, ".internal.selfref")=<externalptr>
Is this a bug or am I doing something wrong?
sessionInfo()
R version 3.4.1 (2017-06-30)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
locale:
[1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggplot2_2.2.1 data.table_1.10.4-3
loaded via a namespace (and not attached):
[1] compiler_3.4.1 colorspace_1.3-2 scales_0.5.0 lazyeval_0.2.0 plyr_1.8.4 gtable_0.2.0 tibble_1.3.3 Rcpp_0.12.12 grid_3.4.1 rlang_0.1.1 munsell_0.4.3

ggplot place facet between two rows of facets

I have 9 plots with 3 time series in each plot, one of these plots contains only one curve and it's the reference plot which I would like to place in between the two rows that contain the other 8 plots. Is there an easy way to do so?
I use facet_wrap(~density,nrow=2) but I get one row with 5 and another with 4 plots. I am sure other people had this problem, is there an easy way around to organize the position of this reference plot, or do I have to create two separate plots and overlay them? Otherwise I might have to move this reference plot in all the other plots but it seems redundant information.
This is my current result, but as you can see it's not very well laid out.
The graphic you are looking for can be generated with grid.arrange from the
gridExtra package. Here is
an example using the storms data set from the
dplyr package.
library(ggplot2)
library(gridExtra)
library(dplyr)
data(storms, package = 'dplyr')
str(storms)
## Classes 'tbl_df', 'tbl' and 'data.frame': 10010 obs. of 13 variables:
## $ name : chr "Amy" "Amy" "Amy" "Amy" ...
## $ year : num 1975 1975 1975 1975 1975 ...
## $ month : num 6 6 6 6 6 6 6 6 6 6 ...
## $ day : int 27 27 27 27 28 28 28 28 29 29 ...
## $ hour : num 0 6 12 18 0 6 12 18 0 6 ...
## $ lat : num 27.5 28.5 29.5 30.5 31.5 32.4 33.3 34 34.4 34 ...
## $ long : num -79 -79 -79 -79 -78.8 -78.7 -78 -77 -75.8 -74.8 ...
## $ status : chr "tropical depression" "tropical depression" "tropical depression" "tropical depression" ...
## $ category : Ord.factor w/ 7 levels "-1"<"0"<"1"<"2"<..: 1 1 1 1 1 1 1 1 2 2 ...
## $ wind : int 25 25 25 25 25 25 25 30 35 40 ...
## $ pressure : int 1013 1013 1013 1013 1012 1012 1011 1006 1004 1002 ...
## $ ts_diameter: num NA NA NA NA NA NA NA NA NA NA ...
## $ hu_diameter: num NA NA NA NA NA NA NA NA NA NA ...
Let's create two graphics. The first graphic will be only for category == -1
storms (this would be the control group in your question). The second
graphic will be a faceted graphic for the category > -1 storms.
First, we'll build a generic ggplot object for the graphics.
# Data-less template plot: storm positions, coloured and faceted by category.
# The colour scale is pinned to every level of storms$category (drop = FALSE)
# rather than to whichever levels occur in the data supplied later.
graphic <- ggplot(mapping = aes(x = long, y = lat, color = category)) +
  geom_point() +
  facet_wrap(~ category) +
  scale_color_hue(
    breaks = levels(storms$category),
    labels = levels(storms$category),
    drop = FALSE
  )
Next we build the two graphics as needed.
# Reference graphic: the template applied to the category == -1 storms only,
# with its legend hidden.
g1 <- graphic %+% dplyr::filter(storms, category == -1) + theme(legend.position = "none")
# Second graphic: the same template applied to every other category.
g2 <- graphic %+% dplyr::filter(storms, category != -1)
gridExtra::grid.arrange can take a layout matrix where the numbers 1 and 2
denote the first and second graphics passed to the function. (This works for
a lot more than just two graphics, by the way.) By repeating the values of 1
and 2 in the matrix we can control the relative size of the two graphics in
the graphics device.
# Layout grid of 3 rows x 8 columns: cells numbered 1 (the three left-hand
# columns) go to g1, cells numbered 2 (the remaining five columns) go to g2.
gridExtra::grid.arrange(
  g1, g2,
  layout_matrix = matrix(
    rep(c(1, 2), times = c(3, 5)),
    nrow = 3, ncol = 8, byrow = TRUE
  )
)
If I understand the question correctly you could reformat your data with appropriate facetting variables to introduce a new row of reference panels
library(ggplot2)
# Toy data: 8 ordered groups (f) of 10 points each.
d <- data.frame(x=rep(1:10, 8), y = rnorm(80),
f=gl(8,10, ordered = TRUE))
# Row facet: the logical `f <= 4` has levels FALSE, TRUE, relabelled 1, 3 -
# so groups 5-8 get 1 and groups 1-4 get 3, leaving the value 2 unused.
d$f1 <- factor(d$f <= 4, labels=c(1,3))
# Column facet: group index modulo 4, giving four distinct values (1, 2, 3, 0).
d$f2 <- as.numeric(d$f) %% 4
# Reference data takes the unused row value f1 = 2 and has no f2 column.
# NOTE(review): presumably facet_grid() then repeats this layer in every
# column of that row - confirm against the rendered plot.
d2 <- data.frame(x=1:10, y=0, f1 = 2)
ggplot(d, aes(x,y)) +
geom_point(aes(colour=f)) +
geom_point(data=d2, colour="black") +
facet_grid(f1~f2)

as_tibble() not working as expected

I am attempting the exercise in R for data science (7.5.2.1, #2): Use geom_tile() together with dplyr to explore how average flight delays vary by destination and month of year. What makes the plot difficult to read? How could you improve it?
First, transmute columns.
library(nycflights13)
foo <- nycflights13::flights %>%
transmute(tot_delay = dep_delay + arr_delay, m = month, d = dest) %>%
filter(!is.na(tot_delay)) %>%
group_by(m, d) %>%
summarise(avg_delay = mean(tot_delay))
Now foo appears to be a data frame based on the 'Source' output.
> foo
Source: local data frame [1,112 x 3]
Groups: m [?]
m d avg_delay
<int> <chr> <dbl>
1 1 ALB 76.571429
2 1 ATL 8.567982
3 1 AUS 19.017751
4 1 AVL 49.000000
5 1 BDL 32.081081
6 1 BHM 47.043478
7 1 BNA 25.930233
8 1 BOS 2.698517
9 1 BQN 8.516129
10 1 BTV 18.393665
# ... with 1,102 more rows
It doesn't appear that as_tibble is working, what could I be doing wrong?
> as_tibble(foo)
Source: local data frame [1,112 x 3]
Groups: m [?]
m d avg_delay
<int> <chr> <dbl>
1 1 ALB 76.571429
2 1 ATL 8.567982
3 1 AUS 19.017751
4 1 AVL 49.000000
5 1 BDL 32.081081
6 1 BHM 47.043478
7 1 BNA 25.930233
8 1 BOS 2.698517
9 1 BQN 8.516129
10 1 BTV 18.393665
# ... with 1,102 more rows
Shouldn't the internals be different for a tibble?
> str(foo)
Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 1112 obs. of 3 variables:
$ m : int 1 1 1 1 1 1 1 1 1 1 ...
$ d : chr "ALB" "ATL" "AUS" "AVL" ...
$ avg_delay: num 76.57 8.57 19.02 49 32.08 ...
- attr(*, "vars")=List of 1
..$ : symbol m
- attr(*, "drop")= logi TRUE
> str(as_tibble(foo))
Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 1112 obs. of 3 variables:
$ m : int 1 1 1 1 1 1 1 1 1 1 ...
$ d : chr "ALB" "ATL" "AUS" "AVL" ...
$ avg_delay: num 76.57 8.57 19.02 49 32.08 ...
- attr(*, "vars")=List of 1
..$ : symbol m
- attr(*, "drop")= logi TRUE
Note that as_tibble() works as expected
> packageDescription("tibble")
Package: tibble
Encoding: UTF-8
Version: 1.3.0
> is_tibble(foo)
[1] TRUE
Works for me - foo is a "tibble" and is announced as "A tibble: 1,112 x 3" in the print:
> foo
Source: local data frame [1,112 x 3]
Groups: m [?]
# A tibble: 1,112 x 3
m d avg_delay
<int> <chr> <dbl>
1 1 ALB 76.571429
2 1 ATL 8.567982
So you possibly have an old version of dplyr. Mine is:
> packageDescription("dplyr")
Package: dplyr
Type: Package
Version: 0.5.0
And everything else:
> sessionInfo()
R version 3.3.1 (2016-06-21)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 14.04.4 LTS
locale:
[1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
[5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
[7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] dplyr_0.5.0 tibble_1.3.1
loaded via a namespace (and not attached):
[1] magrittr_1.5 R6_2.2.0 assertthat_0.2.0 DBI_0.5-1
[5] tools_3.3.1 Rcpp_0.12.11 rlang_0.1.1

Unable to pipe predict() output through filter() to ggplot()

I'm struggling to figure out why I can't use filter() on the results
of predict.gam() and then ggplot() the subset of predictions. I'm not
sure the prediction step is really part of the problem, but that's what
it takes to trigger the error. Just filter() %>% ggplot() with a
dataframe works fine.
library(dplyr)
library(ggplot2)
library(mgcv)
gam1 <- gam(Petal.Length~s(Petal.Width) + Species, data=iris)
nd <- expand.grid(Petal.Width = seq(0,5,0.05),
Species = levels(iris$Species),
stringsAsFactors = FALSE)
predicted <- predict(gam1,newdata=nd)
predicted <- cbind(predicted,nd)
filter(tbl_df(predicted), Species == "setosa") %>%
ggplot(aes(x=Petal.Width, y = predicted)) +
geom_point()
## Error: length(rows) == 1 is not TRUE
But:
filter(tbl_df(predicted), Species == "setosa")
## Source: local data frame [101 x 3]
##
## predicted Petal.Width Species
## (dbl[10]) (dbl) (chr)
## 1 1.294574 0.00 setosa
## 2 1.327482 0.05 setosa
## 3 1.360390 0.10 setosa
## 4 1.393365 0.15 setosa
## 5 1.426735 0.20 setosa
## 6 1.460927 0.25 setosa
## 7 1.496477 0.30 setosa
## 8 1.533949 0.35 setosa
## 9 1.573888 0.40 setosa
## 10 1.616810 0.45 setosa
## .. ... ... ...
And the problem is filter() because:
pick <- predicted$Species == "setosa"
ggplot(predicted[pick,],aes(x=Petal.Width, y = predicted)) +
geom_point()
I've also tried saving the result of filter to an object and using that directly in ggplot() but that has the same error.
Obviously not a crisis, because there's a workaround, but my mental
model of how to use filter() is obviously wrong! Any insights much
appreciated.
Edit: When I first posted this I was still using R 3.2.3 and was getting warnings from ggplot2 and dplyr. So I upgraded to 3.3.0 and it's still happening.
## R version 3.3.0 (2016-05-03)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 10586)
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] mgcv_1.8-12 nlme_3.1-127 ggplot2_2.1.0 dplyr_0.4.3
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.3 knitr_1.11 magrittr_1.5 munsell_0.4.2
## [5] colorspace_1.2-6 lattice_0.20-33 R6_2.1.1 stringr_1.0.0
## [9] plyr_1.8.3 tools_3.3.0 parallel_3.3.0 grid_3.3.0
## [13] gtable_0.1.2 DBI_0.3.1 htmltools_0.2.6 lazyeval_0.1.10
## [17] yaml_2.1.13 assertthat_0.1 digest_0.6.8 Matrix_1.2-6
## [21] formatR_1.2 evaluate_0.7.2 rmarkdown_0.9.5 labeling_0.3
## [25] stringi_1.0-1 scales_0.3.0
The problem arises because your predict() call generates a named array, instead of just a numerical vector.
class(predicted$predicted)
# [1] "array"
The first filter() will give you the correct output on the surface, however if you inspect the output you will notice that the column predicted is still some sort of nested array.
str(filter(tbl_df(predicted), Species == "setosa"))
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 101 obs. of 3 variables:
$ predicted : num [1:303(1d)] 1.29 1.33 1.36 1.39 1.43 ...
..- attr(*, "dimnames")=List of 1
.. ..$ : chr "1" "2" "3" "4" ...
$ Petal.Width: num 0 0.05 0.1 0.15 0.2 0.25 0.3 0.35 0.4 0.45 ...
$ Species: chr "setosa" "setosa" "setosa" "setosa" ...
In contrast, good old logical subsetting does the job on all dimensions:
str(predicted[pick,])
'data.frame': 101 obs. of 3 variables:
$ predicted : num [1:101(1d)] 1.29 1.33 1.36 1.39 1.43 ... # Now 101 obs here too
..- attr(*, "dimnames")=List of 1
.. ..$ : chr "1" "2" "3" "4" ...
$ Petal.Width: num 0 0.05 0.1 0.15 0.2 0.25 0.3 0.35 0.4 0.45 ...
$ Species : chr "setosa" "setosa" "setosa" "setosa" ...
So either you coerce the predicted column to numeric:
library(dplyr)
library(ggplot2)
# Turn the 1-d array column into a plain numeric vector first (as.numeric()
# drops its dim/dimnames attributes), then filter and plot.
mutate(predicted, predicted = as.numeric(predicted)) %>%
  filter(Species == "setosa") %>%
  ggplot(aes(x = Petal.Width, y = predicted)) +
  geom_point()
Or replace filter() by subset():
# Same plot, selecting the rows with base subset() instead of dplyr::filter().
subset(predicted, Species == "setosa") %>%
  ggplot(aes(x = Petal.Width, y = predicted)) +
  geom_point()

ddply error: Error in attributes(out) <- attributes(col) : 'names' attribute must be the same length as the vector

I am trying to apply ddply on a large data.frame (38000 rows / 10 variables), but I am stuck with an error:
ddply(uncertainty.long, .(Species), "nrow")
returns the error:
Error in attributes(out) <- attributes(col) :
'names' attribute [38000] must be the same length as the vector [3800]
> traceback()
11: FUN(1:10[[5L]], ...)
10: lapply(seq_len(n), extract_col_rows, df = x, i = i)
9: extract_rows(x$data, x$index[[i]])
8: `[[.indexed_df`(pieces, i)
7: pieces[[i]]
6: (function (i)
{
piece <- pieces[[i]]
if (.inform) {
res <- try(.fun(piece, ...))
if (inherits(res, "try-error")) {
piece <- paste(capture.output(print(piece)), collapse = "\n")
stop("with piece ", i, ": \n", piece, call. = FALSE)
}
}
else {
res <- .fun(piece, ...)
}
progress$step()
res
})(1L)
5: .Call("loop_apply", as.integer(n), f, env)
4: loop_apply(n, do.ply)
3: llply(.data = .data, .fun = .fun, ..., .progress = .progress,
.inform = .inform, .parallel = .parallel, .paropts = .paropts)
2: ldply(.data = pieces, .fun = .fun, ..., .progress = .progress,
.inform = .inform, .parallel = .parallel, .paropts = .paropts)
1: ddply(uncertainty.long, .(Species), "nrow")
Some more details about my data.frame:
> head(uncertainty.long)
Stack Variable PARun Model Species value year scenario GCM sp
1 sync_current Total PA1 GLM Arctosafulvolineata 100.0000 NA <NA> <NA> Arctosa\nfulvolineata
2 sync_cgcm2_B2A_2020 Total PA1 GLM Arctosafulvolineata 134.6840 2020 B2A cgcm2 Arctosa\nfulvolineata
3 sync_cgcm2_B2A_2050 Total PA1 GLM Arctosafulvolineata 153.7617 2050 B2A cgcm2 Arctosa\nfulvolineata
4 sync_cgcm2_B2A_2080 Total PA1 GLM Arctosafulvolineata 195.7176 2080 B2A cgcm2 Arctosa\nfulvolineata
5 sync_mk2_B2A_2020 Total PA1 GLM Arctosafulvolineata 172.2967 2020 B2A mk2 Arctosa\nfulvolineata
6 sync_mk2_B2A_2050 Total PA1 GLM Arctosafulvolineata 198.9391 2050 B2A mk2 Arctosa\nfulvolineata
> str(uncertainty.long)
'data.frame': 38000 obs. of 10 variables:
$ Stack : Factor w/ 19 levels "sync_cgcm2_B2A_2020",..: 7 1 2 3 14 15 16 11 12 13 ...
$ Variable: Factor w/ 5 levels "Lost","NetChange",..: 5 5 5 5 5 5 5 5 5 5 ...
$ PARun : Factor w/ 5 levels "PA1","PA2","PA3",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Model : Factor w/ 8 levels "CTA","FDA","GAM",..: 5 5 5 5 5 5 5 5 5 5 ...
$ Species : Factor w/ 10 levels "Arctosafulvolineata",..: 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "names")= chr "1" "1" "1" "1" ...
$ value : num 100 135 154 196 172 ...
$ year : num NA 2020 2050 2080 2020 2050 2080 2020 2050 2080 ...
$ scenario: chr NA "B2A" "B2A" "B2A" ...
$ GCM : chr NA "cgcm2" "cgcm2" "cgcm2" ...
$ sp : chr "Arctosa\nfulvolineata" "Arctosa\nfulvolineata" "Arctosa\nfulvolineata" "Arctosa\nfulvolineata" ...
This is my sessionInfo():
> sessionInfo()
R version 3.0.1 (2013-05-16)
Platform: x86_64-w64-mingw32/x64 (64-bit)
locale:
[1] LC_COLLATE=French_France.1252 LC_CTYPE=French_France.1252 LC_MONETARY=French_France.1252 LC_NUMERIC=C LC_TIME=French_France.1252
attached base packages:
[1] parallel splines grid stats graphics grDevices utils datasets methods base
other attached packages:
[1] reshape2_1.2.2 Hmisc_3.12-2 Formula_1.1-1 RCurl_1.95-4.1 bitops_1.0-6 biomod2_3.0.3 pROC_1.5.4 plyr_1.8
[9] rpart_4.1-3 randomForest_4.6-7 mda_0.4-4 class_7.3-9 gbm_2.1 survival_2.37-4 nnet_7.3-7 rasterVis_0.21
[17] hexbin_1.26.2 latticeExtra_0.6-26 RColorBrewer_1.0-5 lattice_0.20-23 abind_1.4-0 raster_2.1-49 sp_1.0-13 ggplot2_0.9.3.1
loaded via a namespace (and not attached):
[1] cluster_1.14.4 colorspace_1.2-2 dichromat_2.0-0 digest_0.6.3 gtable_0.1.2 labeling_0.2 MASS_7.3-29 munsell_0.4.2 proto_0.3-10 scales_0.2.3
[11] stringr_0.6.2 tools_3.0.1 zoo_1.7-10
I have tried to reproduce it with a fewer number of columns (2 columns), it did not change anything.
However, if I reduce the number of lines, it can work when the requested variable "Species" has only one level value:
> small.df <- uncertainty.long[1:3800, ]
> unique(small.df$Species)
[1] Arctosafulvolineata
10 Levels: Arctosafulvolineata Argyronetaaquatica Dolomedesplantarius Enoplognathamordax Iciussubinermis Neonvalentulus Pardosabifasciata Pardosaoreophila ... Trochosaspinipalpis
> ddply(small.df, .(Species), "nrow")
Species nrow
1 Arctosafulvolineata 3800
But if I add another line:
> small.df <- uncertainty.long[1:3801, ]
> unique(small.df$Species)
[1] Arctosafulvolineata Argyronetaaquatica
10 Levels: Arctosafulvolineata Argyronetaaquatica Dolomedesplantarius Enoplognathamordax Iciussubinermis Neonvalentulus Pardosabifasciata Pardosaoreophila ... Trochosaspinipalpis
> small.df[3800:3801, ]
Stack Variable PARun Model Species value year scenario GCM sp
3800 sync_hadcm3_A1B_2080 Lost PA5 MAXENT Arctosafulvolineata -54.90872 2080 A1B hadcm3 Arctosa\nfulvolineata
3801 sync_current Total PA1 GLM Argyronetaaquatica 100.00000 NA <NA> <NA> Argyroneta\naquatica
> ddply(small.df, .(Species), "nrow")
Error in attributes(out) <- attributes(col) :
'names' attribute [3801] must be the same length as the vector [3800]
I have found others with a similar problem : https://stackoverflow.com/a/14162351/2788395.
However, their workaround (reinstalling plyr 1.7 instead of 1.8) did not work for me.
Does anyone have an idea of the problem and/or how to solve it?
Thanks!
Problem solved
The issue was with the "names" attribute of the "Species" column.
I removed them with the following code and ddply worked:
> # Assign the NULL object, not the string "NULL": only NULL removes the
> # names attribute; a string would just become the first name.
> names(uncertainty.long$Species) <- NULL
> ddply(uncertainty.long, .(Species), "nrow")
Species nrow
1 Arctosafulvolineata 3800
2 Argyronetaaquatica 3800
3 Dolomedesplantarius 3800
4 Enoplognathamordax 3800
5 Iciussubinermis 3800
6 Neonvalentulus 3800
7 Pardosabifasciata 3800
8 Pardosaoreophila 3800
9 Piratauliginosus 3800
10 Trochosaspinipalpis 3800
The issue was with the "names" attribute of the "Species" column:
$ Species : Factor w/ 10 levels "Arctosafulvolineata",..: 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "names")= chr "1" "1" "1" "1" ...
I removed them with the following code and ddply worked:
> # Assign the NULL object, not the string "NULL": only NULL removes the
> # names attribute; a string would just become the first name.
> names(uncertainty.long$Species) <- NULL
> ddply(uncertainty.long, .(Species), "nrow")
Species nrow
1 Arctosafulvolineata 3800
2 Argyronetaaquatica 3800
3 Dolomedesplantarius 3800
4 Enoplognathamordax 3800
5 Iciussubinermis 3800
6 Neonvalentulus 3800
7 Pardosabifasciata 3800
8 Pardosaoreophila 3800
9 Piratauliginosus 3800
10 Trochosaspinipalpis 3800

Resources