Nothing rendering when using stat_density_2d(geom = "polygon") - r

For some reason stat_density_2d() does not seem to be working right for me when specifying geom = "polygon" and I am absolutely stumped. Here is my code...
library(sf)
library(tidyverse)
library(RANN2)
library(hexbin)
library(mapproj)
options(stringsAsFactors = FALSE)
raleigh_police <- rgdal::readOGR("https://opendata.arcgis.com/datasets/24c0b37fa9bb4e16ba8bcaa7e806c615_0.geojson", "OGRGeoJSON")
raleigh_police_sf <- raleigh_police %>%
  st_as_sf()

raleigh_police_sf %>%
  filter(crime_description == "Burglary/Residential") %>%
  st_coordinates() %>%
  as_tibble() %>%
  ggplot() +
  stat_density_2d(aes(X, Y, fill = stat(level)), geom = "polygon")
Here is my sessionInfo()...
R version 3.5.1 (2018-07-02)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS 10.14.1
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] bindrcpp_0.2.2 mapproj_1.2.6 maps_3.3.0 hexbin_1.27.2 RANN2_0.1 forcats_0.3.0 stringr_1.3.1 dplyr_0.7.8
[9] purrr_0.2.5 readr_1.1.1 tidyr_0.8.2 tibble_1.4.2 ggplot2_3.1.0 tidyverse_1.2.1 sf_0.7-1
loaded via a namespace (and not attached):
[1] Rcpp_1.0.0 lubridate_1.7.4 lattice_0.20-38 class_7.3-14 utf8_1.1.4 assertthat_0.2.0 rprojroot_1.3-2
[8] digest_0.6.18 R6_2.3.0 cellranger_1.1.0 plyr_1.8.4 backports_1.1.2 evaluate_0.12 e1071_1.7-0
[15] httr_1.3.1 blogdown_0.9 pillar_1.3.0 rlang_0.3.0.1 lazyeval_0.2.1 readxl_1.1.0 rstudioapi_0.8
[22] rmarkdown_1.10 labeling_0.3 rgdal_1.3-6 munsell_0.5.0 broom_0.5.0 compiler_3.5.1 modelr_0.1.2
[29] xfun_0.4 pkgconfig_2.0.2 htmltools_0.3.6 tidyselect_0.2.5 bookdown_0.7 codetools_0.2-15 fansi_0.4.0
[36] crayon_1.3.4 withr_2.1.2 MASS_7.3-51.1 grid_3.5.1 nlme_3.1-137 spData_0.2.9.4 jsonlite_1.5
[43] gtable_0.2.0 DBI_1.0.0 magrittr_1.5 units_0.6-1 scales_1.0.0 cli_1.0.1 stringi_1.2.4
[50] sp_1.3-1 xml2_1.2.0 tools_3.5.1 glue_1.3.0 hms_0.4.2 yaml_2.2.0 colorspace_1.3-2
[57] classInt_0.2-3 rvest_0.3.2 knitr_1.20 bindr_0.1.1 haven_1.1.2
I just get a blank plot with nothing on it. Completely stumped. What am I doing wrong here?
Update (2018-11-18)
It turns out that the primary issue here was options(stringsAsFactors = FALSE). If you comment that line out and run the original code, everything works fine. I found this GitHub issue, which is what prompted me to try that. The answers below provide much more efficient code and also avoid setting options(stringsAsFactors = FALSE).
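In other words, the minimal fix is simply not setting that option before reading the data; a sketch of the change (everything else in the original snippet stays the same):
## leave stringsAsFactors at its default instead of calling options(stringsAsFactors = FALSE)
raleigh_police <- rgdal::readOGR("https://opendata.arcgis.com/datasets/24c0b37fa9bb4e16ba8bcaa7e806c615_0.geojson", "OGRGeoJSON")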

Aside from downloading the file before reading it and switching from readOGR() to read_sf(), it works as-is for me, save for warnings about a few NA points caused by empty geometries:
library(tidyverse)
library(sf)
#> Linking to GEOS 3.6.1, GDAL 2.1.3, PROJ 4.9.3
path <- "~/Downloads/raleigh.geojson"
download.file(
  "https://opendata.arcgis.com/datasets/24c0b37fa9bb4e16ba8bcaa7e806c615_0.geojson",
  path,
  method = "curl"
)

raleigh_police <- sf::read_sf(path, "OGRGeoJSON")

raleigh_police %>%
  filter(crime_description == "Burglary/Residential") %>%
  st_coordinates() %>%
  as_tibble() %>%
  ggplot() +
  stat_density_2d(aes(X, Y, fill = stat(level)), geom = "polygon")
#> Warning: Removed 5 rows containing non-finite values (stat_density2d).
The empty rows:
raleigh_police %>%
  filter(crime_description == "Burglary/Residential",
         st_is_empty(.))
#> Simple feature collection with 5 features and 21 fields (with 5 geometries empty)
#> geometry type: POINT
#> dimension: XY
#> bbox: xmin: NA ymin: NA xmax: NA ymax: NA
#> epsg (SRID): 4326
#> proj4string: +proj=longlat +datum=WGS84 +no_defs
#> # A tibble: 5 x 22
#> OBJECTID GlobalID case_number crime_category crime_code crime_descripti…
#> <int> <chr> <chr> <chr> <chr> <chr>
#> 1 205318 8057315… P14076062 BURGLARY/RESI… 30B Burglary/Reside…
#> 2 417488 70afb27… P15027702 BURGLARY/RESI… 30B Burglary/Reside…
#> 3 424718 bdf69fa… P18029113 BURGLARY/RESI… 30B Burglary/Reside…
#> 4 436550 711c05b… P18044139 BURGLARY/RESI… 30B Burglary/Reside…
#> 5 442091 9d7a008… P18051764 BURGLARY/RESI… 30B Burglary/Reside…
#> # … with 16 more variables: crime_type <chr>, reported_block_address <chr>,
#> # city_of_incident <chr>, city <chr>, district <chr>, reported_date <dttm>,
#> # reported_year <int>, reported_month <int>, reported_day <int>,
#> # reported_hour <int>, reported_dayofwk <chr>, latitude <dbl>,
#> # longitude <dbl>, agency <chr>, updated_date <dttm>, geometry <POINT [°]>
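If you'd rather avoid the warning altogether, one option (a sketch using sf's st_is_empty() with the objects defined above) is to drop the empty geometries before plotting:
raleigh_police %>%
  filter(crime_description == "Burglary/Residential",
         !st_is_empty(geometry)) %>%  # drop the 5 features with empty POINT geometries
  st_coordinates() %>%
  as_tibble() %>%
  ggplot() +
  stat_density_2d(aes(X, Y, fill = stat(level)), geom = "polygon")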
sessionInfo()
#> R version 3.5.1 (2018-07-02)
#> Platform: x86_64-apple-darwin15.6.0 (64-bit)
#> Running under: macOS 10.14.1
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] sf_0.7-1 forcats_0.3.0 stringr_1.3.1
#> [4] dplyr_0.7.99.9000 purrr_0.2.5 readr_1.2.0
#> [7] tidyr_0.8.2 tibble_1.4.99.9005 ggplot2_3.1.0
#> [10] tidyverse_1.2.1
#>
#> loaded via a namespace (and not attached):
#> [1] tidyselect_0.2.5 haven_1.1.2 lattice_0.20-35
#> [4] colorspace_1.3-2 htmltools_0.3.6 yaml_2.2.0
#> [7] rlang_0.3.0.1 e1071_1.7-0 pillar_1.3.0.9001
#> [10] glue_1.3.0 withr_2.1.2 DBI_1.0.0
#> [13] modelr_0.1.2 readxl_1.1.0 plyr_1.8.4
#> [16] munsell_0.5.0 gtable_0.2.0 cellranger_1.1.0
#> [19] rvest_0.3.2 evaluate_0.12 labeling_0.3
#> [22] knitr_1.20 class_7.3-14 broom_0.5.0
#> [25] Rcpp_0.12.19.3 scales_1.0.0 backports_1.1.2
#> [28] classInt_0.2-3 jsonlite_1.5 hms_0.4.2.9001
#> [31] digest_0.6.18 stringi_1.2.4 grid_3.5.1
#> [34] rprojroot_1.3-2 cli_1.0.1 tools_3.5.1
#> [37] magrittr_1.5 lazyeval_0.2.1 crayon_1.3.4
#> [40] pkgconfig_2.0.2 MASS_7.3-51 xml2_1.2.0
#> [43] spData_0.2.9.4 lubridate_1.7.4 assertthat_0.2.0
#> [46] rmarkdown_1.10 httr_1.3.1 R6_2.3.0
#> [49] units_0.6-1 nlme_3.1-137 compiler_3.5.1

As noted, this is just point-level data and the CSV they provide can be a fine substitute:
library(tidyverse)
rp_csv_url <- "https://opendata.arcgis.com/datasets/24c0b37fa9bb4e16ba8bcaa7e806c615_0.csv"

httr::GET(
  url = rp_csv_url,
  httr::write_disk(basename(rp_csv_url)), # won't overwrite if it exists unless explicitly told to, so you get caching for free
  httr::progress() # I suspect this is a big file so it's nice to see a progress bar
)

raleigh_police <- read_csv(basename(rp_csv_url))

mutate(
  raleigh_police,
  longitude = as.numeric(longitude), # they come in wonky, still
  latitude = as.numeric(latitude)    # they come in wonky, still
) -> raleigh_police

raleigh_police %>%
  filter(crime_description == "Burglary/Residential") %>%
  ggplot() +
  stat_density_2d(
    aes(longitude, latitude, fill = stat(level)),
    color = "#2b2b2b", size = 0.125, geom = "polygon"
  ) +
  viridis::scale_fill_viridis(direction = -1, option = "magma") +
  hrbrthemes::theme_ipsum_rc()
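If you'd rather not pull in the viridis and hrbrthemes packages, ggplot2's built-in viridis scale gives much the same look (a sketch under that assumption):
raleigh_police %>%
  filter(crime_description == "Burglary/Residential") %>%
  ggplot() +
  stat_density_2d(
    aes(longitude, latitude, fill = stat(level)),
    color = "#2b2b2b", size = 0.125, geom = "polygon"
  ) +
  scale_fill_viridis_c(direction = -1, option = "magma") + # built into ggplot2 >= 3.0.0
  theme_minimal()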
If you'd like to turn level into something more meaningful:
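(Note: rp_br is not defined in this excerpt; presumably it is the filtered burglary subset, e.g. as below.)
rp_br <- filter(raleigh_police, crime_description == "Burglary/Residential") # assumed definition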
h <- c(MASS::bandwidth.nrd(rp_br$longitude),
       MASS::bandwidth.nrd(rp_br$latitude))

dens <- MASS::kde2d(
  rp_br$longitude, rp_br$latitude, h = h, n = 100
)

breaks <- pretty(range(dens$z), 10)

zdf <- data.frame(expand.grid(x = dens$x, y = dens$y), z = as.vector(dens$z))

z <- tapply(zdf$z, zdf[c("x", "y")], identity)

cl <- grDevices::contourLines(
  x = sort(unique(dens$x)), y = sort(unique(dens$y)), z = dens$z,
  levels = breaks
)

sp::SpatialPolygons(
  lapply(1:length(cl), function(idx) {
    sp::Polygons(
      srl = list(sp::Polygon(
        matrix(c(cl[[idx]]$x, cl[[idx]]$y), nrow = length(cl[[idx]]$x), byrow = FALSE)
      )),
      ID = idx
    )
  })
) -> cont

sp::coordinates(rp_br) <- ~longitude+latitude
then:
data_frame(
  ct = sapply(sp::over(cont, sp::geometry(rp_br), returnList = TRUE), length),
  id = 1:length(ct),
  lvl = sapply(cl, function(x) x$level)
) %>%
  count(lvl, wt = ct) %>%
  mutate(
    pct = n/nrow(rp_br),
    pct_lab = sprintf("%s of the points fall within this level", scales::percent(pct))
  )
## # A tibble: 10 x 4
## lvl n pct pct_lab
## <dbl> <int> <dbl> <chr>
## 1 10. 7302 0.927 92.7% of the points fall within this level
## 2 20. 6243 0.792 79.2% of the points fall within this level
## 3 30. 4786 0.607 60.7% of the points fall within this level
## 4 40. 3204 0.407 40.7% of the points fall within this level
## 5 50. 1945 0.247 24.7% of the points fall within this level
## 6 60. 1277 0.162 16.2% of the points fall within this level
## 7 70. 793 0.101 10.1% of the points fall within this level
## 8 80. 474 0.0601 6.0% of the points fall within this level
## 9 90. 279 0.0354 3.5% of the points fall within this level
## 10 100. 44 0.00558 0.6% of the points fall within this level

Related

R+ggplot2: adding log tick marks to a histogram

Please have a look at the reprex at the end of the post.
I generate some lognormally distributed values and then bin the distribution using non-uniform bins (the grid is evenly spaced on a logarithmic scale).
The point is not the maths, but the fact that, using annotation_logticks (see https://ggplot2.tidyverse.org/reference/annotation_logticks.html), I cannot add the ticks to the plot.
Does anybody understand what goes wrong?
Thanks a lot!
library(tidyverse)
library(scales)
#>
#> Attaching package: 'scales'
#> The following object is masked from 'package:purrr':
#>
#> discard
#> The following object is masked from 'package:readr':
#>
#> col_factor
## auxiliary functions
scale_x_log10nice <- function(name = NULL, omag = seq(-20, 20), ...) {
  breaks10 <- 10^omag
  scale_x_log10(name, breaks = breaks10,
                labels = scales::trans_format("log10", scales::math_format(10^.x)), ...)
}

log_binning <- function(x_min, x_max, n_bin) {
  m <- n_bin - 1
  r <- (x_max/x_min)^(1/m)
  my_seq <- seq(0, m, by = 1)
  grid <- x_min*r^my_seq
  grid
}

##################################################
set.seed(1234)
n_bins <- 10
df <- tibble(x = rlnorm(10e4, sdlog = 2))
my_breaks2 <- log_binning(min(df$x), max(df$x), n_bins)

gpl <- ggplot(df, aes(x = x)) +
  theme_bw() +
  geom_histogram(## binwidth=10e3,
                 colour = "black", fill = "blue", ## boundary=0,
                 breaks = my_breaks2) +
  scale_x_log10nice("x values")
gpl

gpl2 <- gpl +
  annotation_logticks(sides = "b", outside = TRUE)
## where are the logticks?
gpl2
sessionInfo()
#> R version 4.2.2 (2022-10-31)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Debian GNU/Linux 11 (bullseye)
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
#>
#> locale:
#> [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
#> [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
#> [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] scales_1.2.1 forcats_0.5.2 stringr_1.5.0 dplyr_1.0.99.9000
#> [5] purrr_1.0.0 readr_2.1.3 tidyr_1.2.1 tibble_3.1.8
#> [9] ggplot2_3.4.0 tidyverse_1.3.2
#>
#> loaded via a namespace (and not attached):
#> [1] tidyselect_1.2.0 xfun_0.36 haven_2.5.1
#> [4] gargle_1.2.1 colorspace_2.0-3 vctrs_0.5.1
#> [7] generics_0.1.3 htmltools_0.5.4 yaml_2.3.6
#> [10] utf8_1.2.2 rlang_1.0.6 pillar_1.8.1
#> [13] glue_1.6.2 withr_2.5.0 DBI_1.1.3
#> [16] dbplyr_2.2.1 readxl_1.4.1 modelr_0.1.10
#> [19] lifecycle_1.0.3 munsell_0.5.0 gtable_0.3.1
#> [22] cellranger_1.1.0 rvest_1.0.3 evaluate_0.19
#> [25] labeling_0.4.2 knitr_1.41 tzdb_0.3.0
#> [28] fastmap_1.1.0 fansi_1.0.3 highr_0.10
#> [31] broom_1.0.2 backports_1.4.1 googlesheets4_1.0.1
#> [34] jsonlite_1.8.4 farver_2.1.1 fs_1.5.2
#> [37] hms_1.1.2 digest_0.6.31 stringi_1.7.8
#> [40] grid_4.2.2 cli_3.6.0 tools_4.2.2
#> [43] magrittr_2.0.3 crayon_1.5.2 pkgconfig_2.0.3
#> [46] ellipsis_0.3.2 xml2_1.3.3 reprex_2.0.2
#> [49] googledrive_2.0.0 lubridate_1.9.0 timechange_0.1.1
#> [52] assertthat_0.2.1 rmarkdown_2.19 httr_1.4.4
#> [55] R6_2.5.1 compiler_4.2.2
Created on 2023-01-17 with reprex v2.0.2
If you want to use outside = TRUE in annotation_logticks, you also need to turn clipping off.
From the docs for ?annotation_logticks:
outside: logical that controls whether to move the log ticks outside of the plot area. Default is off (FALSE). You will also need to use coord_cartesian(clip = "off").
gpl +
  annotation_logticks(sides = "b", outside = TRUE) +
  coord_cartesian(clip = "off")

R + Arrow 10 : convert blank to numeric NA

Please have a look at the reprex at the end of the post.
I need to read a column as a string, perform several manipulations, and then convert it to a numeric column.
The blanks ("") in the string column give me a headache because arrow does not convert them to the numeric missing value NA.
Does anybody know how to achieve that?
Many thanks
library(tidyverse)
library(arrow)
#> Some features are not enabled in this build of Arrow. Run `arrow_info()` for more information.
#>
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#>
#> timestamp
df <- tibble(x = rep(c("4000 -", "6000 -", "", "8000 - "), 10),
             y = seq(1, 10, length = 40))
write_csv(df, "test_string.csv")

data <- open_dataset("test_string.csv",
                     format = "csv",
                     skip = 1,
                     schema = schema(x = string(), y = double()))

data2 <- data |>
  mutate(x = sub(" -.*", "", x)) |>
  mutate(x2 = as.numeric(x)) |>
  collect() ## how to convert the blank to a numeric NA?
#> Error in `collect()`:
#> ! Invalid: Failed to parse string: '' as a scalar of type double
#> Backtrace:
#> ▆
#> 1. ├─dplyr::collect(mutate(mutate(data, x = sub(" -.*", "", x)), x2 = as.numeric(x)))
#> 2. └─arrow:::collect.arrow_dplyr_query(mutate(mutate(data, x = sub(" -.*", "", x)), x2 = as.numeric(x)))
#> 3. └─base::tryCatch(...)
#> 4. └─base (local) tryCatchList(expr, classes, parentenv, handlers)
#> 5. └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]])
#> 6. └─value[[3L]](cond)
#> 7. └─arrow:::augment_io_error_msg(e, call, schema = x$.data$schema)
#> 8. └─rlang::abort(msg, call = call)
sessionInfo()
#> R version 4.2.2 (2022-10-31)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Debian GNU/Linux 11 (bullseye)
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.13.so
#>
#> locale:
#> [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
#> [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
#> [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] arrow_10.0.0 forcats_0.5.2 stringr_1.4.1 dplyr_1.0.10
#> [5] purrr_0.3.5 readr_2.1.3 tidyr_1.2.1 tibble_3.1.8
#> [9] ggplot2_3.4.0 tidyverse_1.3.2
#>
#> loaded via a namespace (and not attached):
#> [1] lubridate_1.9.0 assertthat_0.2.1 digest_0.6.30
#> [4] utf8_1.2.2 R6_2.5.1 cellranger_1.1.0
#> [7] backports_1.4.1 reprex_2.0.2 evaluate_0.17
#> [10] httr_1.4.4 highr_0.9 pillar_1.8.1
#> [13] rlang_1.0.6 googlesheets4_1.0.1 readxl_1.4.1
#> [16] R.utils_2.12.1 R.oo_1.25.0 rmarkdown_2.17
#> [19] styler_1.8.0 googledrive_2.0.0 bit_4.0.4
#> [22] munsell_0.5.0 broom_1.0.1 compiler_4.2.2
#> [25] modelr_0.1.9 xfun_0.34 pkgconfig_2.0.3
#> [28] htmltools_0.5.3 tidyselect_1.2.0 fansi_1.0.3
#> [31] crayon_1.5.2 tzdb_0.3.0 dbplyr_2.2.1
#> [34] withr_2.5.0 R.methodsS3_1.8.2 grid_4.2.2
#> [37] jsonlite_1.8.3 gtable_0.3.1 lifecycle_1.0.3
#> [40] DBI_1.1.3 magrittr_2.0.3 scales_1.2.1
#> [43] vroom_1.6.0 cli_3.4.1 stringi_1.7.8
#> [46] fs_1.5.2 xml2_1.3.3 ellipsis_0.3.2
#> [49] generics_0.1.3 vctrs_0.5.0 tools_4.2.2
#> [52] bit64_4.0.5 R.cache_0.16.0 glue_1.6.2
#> [55] hms_1.1.2 parallel_4.2.2 fastmap_1.1.0
#> [58] yaml_2.3.6 timechange_0.1.1 colorspace_2.0-3
#> [61] gargle_1.2.1 rvest_1.0.3 knitr_1.40
#> [64] haven_2.5.1
Created on 2022-11-07 with reprex v2.0.2
ifelse works here as long as the replacement value keeps the correct class (NA_character_ rather than a plain NA coerced to double); if_else enforces that check anyway, so either one works.
data |>
  mutate(x = sub(" -.*", "", x)) |>
  mutate(
    x = ifelse(x == "", NA_character_, x), # if_else also works
    x2 = as.numeric(x)
  ) |>
  collect()
# # A tibble: 40 x 3
# x y x2
# <chr> <dbl> <dbl>
# 1 4000 1 4000
# 2 6000 1.23 6000
# 3 NA 1.46 NA
# 4 8000 1.69 8000
# 5 4000 1.92 4000
# 6 6000 2.15 6000
# 7 NA 2.38 NA
# 8 8000 2.62 8000
# 9 4000 2.85 4000
# 10 6000 3.08 6000
# # ... with 30 more rows
Try using read_csv() instead of open_dataset(); readr treats empty strings as missing (NA) by default, so the blank no longer gets in the way of the numeric conversion.
library(readr)
data <- read_csv("test_string.csv")
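A short sketch of how that plays out on the question's test_string.csv (data here is the tibble returned by read_csv() above):
data |>
  mutate(x = sub(" -.*", "", x),  # strip the trailing " -" part; NA stays NA
         x2 = as.numeric(x))      # "" was already read in as NA, so this converts cleanly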

How to reverse factor levels for step_woe preprocessing recipe step

I am applying a WOE (weight of evidence) transformation to my features (using step_woe from the embed package) within the recipes framework, but by default it takes the 0 value as the reference, so the WOE values are reversed.
I am trying to relevel the target to set "1" as the reference, but the results are the same (no change in the direction of the WOE values). Any idea how to get it right?
Here is an example. First I create an example dataset with a target (0s and 1s) and one feature ('yes', 'no') in a perfect relationship with each other. Then I apply the step_woe transformation with the reference level set to either '0' or '1'; the results do not differ.
library(tidyverse)
library(recipes)
#>
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stringr':
#>
#> fixed
#> The following object is masked from 'package:stats':
#>
#> step
library(embed)
example_df <-
  tibble(
    target = rbinom(1000, 1, 0.5),
    feature = ifelse(target == 1, "yes", "no")
  ) %>%
  mutate_all(as.factor) %>%
  print()
#> # A tibble: 1,000 x 2
#> target feature
#> <fct> <fct>
#> 1 0 no
#> 2 1 yes
#> 3 0 no
#> 4 0 no
#> 5 1 yes
#> 6 0 no
#> 7 1 yes
#> 8 1 yes
#> 9 0 no
#> 10 0 no
#> # … with 990 more rows
woe_recipe_0 <-
  recipe(target ~ feature, data = example_df) %>%
  step_relevel(target, ref_level = "0") %>%
  embed::step_woe(all_nominal_predictors(), outcome = "target") %>%
  prep(., retain = FALSE)
tidy(woe_recipe_0, number = 2)
#> # A tibble: 2 x 10
#> terms value n_tot n_0 n_1 p_0 p_1 woe outcome id
#> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
#> 1 feature no 493 493 0 1 0 20.0 target woe_nY7AB
#> 2 feature yes 507 0 507 0 1 -20.0 target woe_nY7AB
woe_recipe_1 <-
  recipe(target ~ feature, data = example_df) %>%
  step_relevel(target, ref_level = "1") %>%
  embed::step_woe(all_nominal_predictors(), outcome = "target") %>%
  prep(., retain = FALSE)
tidy(woe_recipe_1, number = 2)
#> # A tibble: 2 x 10
#> terms value n_tot n_0 n_1 p_0 p_1 woe outcome id
#> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
#> 1 feature no 493 493 0 1 0 20.0 target woe_Lt6pK
#> 2 feature yes 507 0 507 0 1 -20.0 target woe_Lt6pK
sessionInfo()
#> R version 3.5.1 (2018-07-02)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Red Hat Enterprise Linux
#>
#> Matrix products: default
#> BLAS: /opt/R/3.5.1/lib64/R/lib/libRblas.so
#> LAPACK: /opt/R/3.5.1/lib64/R/lib/libRlapack.so
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] embed_0.1.5 recipes_0.1.17 forcats_0.4.0 stringr_1.4.0
#> [5] dplyr_1.0.7 purrr_0.3.4 readr_1.3.1 tidyr_1.1.2
#> [9] tibble_3.0.4 ggplot2_3.3.5 tidyverse_1.3.0
#>
#> loaded via a namespace (and not attached):
#> [1] httr_1.4.1 jsonlite_1.6 splines_3.5.1
#> [4] prodlim_2019.11.13 modelr_0.1.5 RcppParallel_5.0.2
#> [7] assertthat_0.2.1 highr_0.8 cellranger_1.1.0
#> [10] yaml_2.2.0 ipred_0.9-12 pillar_1.6.2
#> [13] backports_1.2.1 lattice_0.20-35 glue_1.5.1
#> [16] reticulate_1.13 digest_0.6.27 rvest_0.3.5
#> [19] colorspace_2.0-0 htmltools_0.4.0 Matrix_1.2-14
#> [22] timeDate_3043.102 pkgconfig_2.0.3 broom_0.7.6
#> [25] haven_2.2.0 scales_1.1.0 whisker_0.4
#> [28] gower_0.2.1 lava_1.6.6 generics_0.1.0
#> [31] ellipsis_0.3.2 withr_2.4.1 keras_2.2.5.0
#> [34] nnet_7.3-12 cli_2.4.0 survival_2.42-3
#> [37] magrittr_2.0.1 crayon_1.4.1 readxl_1.3.1
#> [40] evaluate_0.14 fs_1.3.1 fansi_0.4.2
#> [43] MASS_7.3-51.4 xml2_1.2.2 class_7.3-14
#> [46] tools_3.5.1 hms_1.1.1 lifecycle_1.0.1
#> [49] munsell_0.5.0 reprex_0.3.0 compiler_3.5.1
#> [52] rlang_0.4.12 grid_3.5.1 rstudioapi_0.11
#> [55] base64enc_0.1-3 rmarkdown_1.18 gtable_0.3.0
#> [58] DBI_1.1.1 R6_2.5.0 tfruns_1.4
#> [61] lubridate_1.7.4 knitr_1.26 tensorflow_2.0.0
#> [64] uwot_0.1.5 utf8_1.2.1 zeallot_0.1.0
#> [67] stringi_1.4.3 Rcpp_1.0.7 vctrs_0.3.8
#> [70] rpart_4.1-15 dbplyr_2.1.1 tidyselect_1.1.1.9000
#> [73] xfun_0.11
Created on 2022-02-02 by the reprex package (v0.3.0)

Need to generate a large quantity of data using the rnorm function

I have a data frame in which each element represents a mean value. The data frame looks like this:
Mean_Data <- data.frame(Apple = runif(5), orange = runif(5),
                        Pear = runif(5), Banana = runif(5))
I also have another data frame in which each element is an STD value corresponding to the mean in the same cell of Mean_Data. This data frame looks like this:
STD_Data <- data.frame(Red = runif(5), Blue = runif(5),
                       Green = runif(5), Yellow = runif(5))
I want to write a function that uses rnorm to generate 50 random numbers using the mean and STD value obtained from each cell of the Mean_Data and STD_Data data frames.
For instance, I want to repeatedly use rnorm in the following fashion:
rnorm(50, mean = Mean_Data[1, 1], sd = STD_Data[1, 1])
.
.
.
rnorm(50, mean = Mean_Data[5, 4], sd = STD_Data[5, 4])
Thus, the output will have 1,000 observations in total (20 cells x 50 draws). I assume I need to write a function built around apply; however, I cannot figure out the correct syntax.
This is what I have so far:
Feat_Data <- function(Mean_Data, STD_Data) {
  Feat_Data <- as.data.frame(apply(Mean_Data, 2, rnorm(50, mean = , sd = )))
  return(Feat_Data)
}
However, I do not know the proper syntax to fill in the mean and sd arguments of rnorm; I want to populate them with the values from each cell of the Mean_Data and STD_Data data frames.
I think this should be close to what you want
library(tidyverse)
Mean_Data <- data.frame(Apple = runif(5), orange = runif(5),
                        Pear = runif(5), Banana = runif(5))
STD_Data <- data.frame(Red = runif(5), Blue = runif(5),
                       Green = runif(5), Yellow = runif(5))
temp_mean <- Mean_Data |> unname() |> as.matrix()
temp_std <- STD_Data |> unname() |> as.matrix()
temp_grid <- expand.grid(temp_mean, temp_std)

tidy_mean <- Mean_Data |>
  pivot_longer(everything(), names_to = "names_mean", values_to = "values_mean") |>
  arrange(names_mean)

tidy_sd <- STD_Data |>
  pivot_longer(everything(), names_to = "names_sd", values_to = "values_sd") |>
  arrange(names_sd)

tidy_simulation <- tidy_mean |>
  bind_cols(tidy_sd)
tidy_simulation |>
  rowwise() |> # rowwise so each mean/sd pair gets its own 50 draws (otherwise the same 50 values are recycled to every row)
  mutate(generate = list(rnorm(50, mean = values_mean, sd = values_sd))) |>
  ungroup() |>
  unnest(generate)
#> # A tibble: 1,000 x 5
#> names_mean values_mean names_sd values_sd generate
#> <chr> <dbl> <chr> <dbl> <dbl>
#> 1 Apple 0.102 Blue 0.439 -1.10
#> 2 Apple 0.102 Blue 0.439 1.63
#> 3 Apple 0.102 Blue 0.439 1.38
#> 4 Apple 0.102 Blue 0.439 0.595
#> 5 Apple 0.102 Blue 0.439 0.666
#> 6 Apple 0.102 Blue 0.439 -0.129
#> 7 Apple 0.102 Blue 0.439 1.12
#> 8 Apple 0.102 Blue 0.439 0.737
#> 9 Apple 0.102 Blue 0.439 0.0333
#> 10 Apple 0.102 Blue 0.439 1.84
#> # ... with 990 more rows
Created on 2021-05-31 by the reprex package (v2.0.0)
Session info
sessionInfo()
#> R version 4.1.0 (2021-05-18)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 21390)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=Portuguese_Brazil.1252 LC_CTYPE=Portuguese_Brazil.1252
#> [3] LC_MONETARY=Portuguese_Brazil.1252 LC_NUMERIC=C
#> [5] LC_TIME=Portuguese_Brazil.1252
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.6 purrr_0.3.4
#> [5] readr_1.4.0 tidyr_1.1.3 tibble_3.1.2 ggplot2_3.3.3
#> [9] tidyverse_1.3.1
#>
#> loaded via a namespace (and not attached):
#> [1] tidyselect_1.1.1 xfun_0.23 haven_2.4.1 colorspace_2.0-1
#> [5] vctrs_0.3.8 generics_0.1.0 htmltools_0.5.1.1 yaml_2.2.1
#> [9] utf8_1.2.1 rlang_0.4.11 pillar_1.6.1 glue_1.4.2
#> [13] withr_2.4.2 DBI_1.1.1 dbplyr_2.1.1 modelr_0.1.8
#> [17] readxl_1.3.1 lifecycle_1.0.0 munsell_0.5.0 gtable_0.3.0
#> [21] cellranger_1.1.0 rvest_1.0.0 evaluate_0.14 knitr_1.33
#> [25] ps_1.6.0 fansi_0.5.0 highr_0.9 broom_0.7.6
#> [29] Rcpp_1.0.6 backports_1.2.1 scales_1.1.1 jsonlite_1.7.2
#> [33] fs_1.5.0 hms_1.1.0 digest_0.6.27 stringi_1.6.2
#> [37] grid_4.1.0 cli_2.5.0 tools_4.1.0 magrittr_2.0.1
#> [41] crayon_1.4.1 pkgconfig_2.0.3 ellipsis_0.3.2 xml2_1.3.2
#> [45] reprex_2.0.0 lubridate_1.7.10 assertthat_0.2.1 rmarkdown_2.8
#> [49] httr_1.4.2 rstudioapi_0.13 R6_2.5.0 compiler_4.1.0
Use a Map-based approach:
# the outer Map pairs up the columns of Mean_Data and STD_Data;
# the inner Map draws 50 values for each mean/sd cell pair,
# and the rbind calls stack everything into a 20 x 50 matrix
do.call(rbind, Map(function(x, y) {
  do.call(rbind, Map(function(p, q) rnorm(50, p, q), x, y))
}, Mean_Data, STD_Data)) -> result
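A quick sanity check of the shape (a sketch; the dimensions follow directly from the 5 x 4 inputs):
dim(result)     # one row per mean/STD cell, one column per draw
#> [1] 20 50
length(result)  # 1,000 simulated values in total
#> [1] 1000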

Is there a way to get coordinates for raster values extracted from a polygon in R?

I have a shapefile of polygons that I want to use to extract raster values into a data frame. So I do that with the following code:
shp <- sf::st_read('example.shp')
r <- raster::raster('example.tif')
extract <- raster::extract(r, shp, df = TRUE)
This gives me a data frame with two columns: the numeric ID for each polygon and the associated extracted raster value. Now I would like to add the x, y coordinates for each extracted raster value. I have seen this done for point shapefiles, but I am not sure how to apply it to polygon geometries.
Try this; it combines raster::extract(..., cellnumbers = TRUE) with raster::xyFromCell():
library(sf)
#> Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1
library(raster)
#> Loading required package: sp
library(giscoR)
# Dummy data
shp <-
  gisco_get_nuts(country = "Netherlands",
                 nuts_level = 3,
                 epsg = 4326)[, 1]
r <- raster::getData("alt", country = "Netherlands")
#> Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj
#> = prefer_proj): Discarded datum Unknown based on WGS84 ellipsoid in Proj4
#> definition
extract <- raster::extract(r, shp, df = TRUE, cellnumbers = TRUE)
# Order (for checking purposes)
extract <- extract[order(extract$cell),]
# Extract coordinates
xy <- xyFromCell(r, cell = extract$cell, spatial = FALSE)
# Convert to df and add cellnumber
xy <- as.data.frame(xy)
xy$cell <- extract$cell
# Merge two data frames
extract_end <- merge(extract, xy)
extract_end <- extract_end[order(extract_end$cell),]
# This is what you are looking for: extract_end is a data frame
# with the extracted values plus the x and y coordinates
head(extract_end)
#> cell ID NLD_msk_alt x y
#> 1 2319 3 NA 6.620833 53.56250
#> 2 2796 3 NA 6.595833 53.55417
#> 3 2797 3 NA 6.604167 53.55417
#> 4 2798 3 NA 6.612500 53.55417
#> 5 2799 3 NA 6.620833 53.55417
#> 6 2800 3 NA 6.629167 53.55417
# Checks - can be omitted
nrow(extract) == nrow(extract_end)
#> [1] TRUE
# Check NAs in coordinates
nrow(extract_end[is.na(extract_end$x), ])
#> [1] 0
nrow(extract_end[is.na(extract_end$y), ])
#> [1] 0
# Convert to sf for checks
sfobj <- st_as_sf(extract_end, coords = c("x", "y"))
sfobj <- st_set_crs(sfobj, st_crs(4326))
# Plot as sf points
par(mar = c(3, 3, 3, 3))
plot(
  sfobj[, 3],
  axes = TRUE,
  main = "sf points",
  key.pos = 4,
  breaks = "equal",
  nbreaks = 100,
  pal = rev(terrain.colors(100))
)
# Compare with raster plot
par(mar = c(3, 3, 3, 3))
plot(r, main = "Raster")
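Applied back to the objects in the question, the whole thing boils down to two steps (a sketch; shp and r are the question's own shapefile and raster):
# extract with cell numbers, then look the coordinates up from the raster grid
extract <- raster::extract(r, shp, df = TRUE, cellnumbers = TRUE)
xy <- raster::xyFromCell(r, extract$cell)
extract$x <- xy[, "x"]
extract$y <- xy[, "y"]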
sessionInfo()
#> R version 4.0.3 (2020-10-10)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 19041)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=Spanish_Spain.1252 LC_CTYPE=Spanish_Spain.1252
#> [3] LC_MONETARY=Spanish_Spain.1252 LC_NUMERIC=C
#> [5] LC_TIME=Spanish_Spain.1252
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] giscoR_0.2.4-9000 raster_3.4-5 sp_1.4-5 sf_0.9-7
#>
#> loaded via a namespace (and not attached):
#> [1] Rcpp_1.0.6 pillar_1.4.7 compiler_4.0.3 highr_0.8
#> [5] class_7.3-18 tools_4.0.3 digest_0.6.27 evaluate_0.14
#> [9] lifecycle_1.0.0 tibble_3.0.6 lattice_0.20-41 pkgconfig_2.0.3
#> [13] rlang_0.4.10 reprex_1.0.0 DBI_1.1.1 rgdal_1.5-23
#> [17] yaml_2.2.1 xfun_0.21 e1071_1.7-4 styler_1.3.2
#> [21] stringr_1.4.0 dplyr_1.0.4 knitr_1.31 generics_0.1.0
#> [25] fs_1.5.0 vctrs_0.3.6 classInt_0.4-3 grid_4.0.3
#> [29] tidyselect_1.1.0 glue_1.4.2 R6_2.5.0 rmarkdown_2.6
#> [33] purrr_0.3.4 magrittr_2.0.1 codetools_0.2-18 backports_1.2.1
#> [37] ellipsis_0.3.1 htmltools_0.5.1.1 units_0.6-7 assertthat_0.2.1
#> [41] countrycode_1.2.0 KernSmooth_2.23-18 stringi_1.5.3 crayon_1.4.1
Created on 2021-02-19 by the reprex package (v1.0.0)
