Please have a look at the reprex at the end of the post.
I need to read a column as a string, perform several manipulations and then save convert it to a numerical column.
The blanks ("") in the string column give me a headache because arrow does not convert them to numerical missing values NA.
Does anybody know how to achieve that?
Many thanks
library(tidyverse)
library(arrow)
#> Some features are not enabled in this build of Arrow. Run `arrow_info()` for more information.
#>
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#>
#> timestamp
df <- tibble(x=rep(c("4000 -", "6000 -", "", "8000 - "), 10),
y=seq(1,10, length=40))
write_csv(df, "test_string.csv")
data <- open_dataset("test_string.csv",
format="csv",
skip=1,
schema=schema(x=string(), y=double()))
data2 <- data |>
mutate(x= sub(" -.*", "", x) ) |>
mutate(x2=as.numeric(x)) |>
collect() ## how to convert the blank to a numeric NA ?
#> Error in `collect()`:
#> ! Invalid: Failed to parse string: '' as a scalar of type double
#> Backtrace:
#> ▆
#> 1. ├─dplyr::collect(mutate(mutate(data, x = sub(" -.*", "", x)), x2 = as.numeric(x)))
#> 2. └─arrow:::collect.arrow_dplyr_query(mutate(mutate(data, x = sub(" -.*", "", x)), x2 = as.numeric(x)))
#> 3. └─base::tryCatch(...)
#> 4. └─base (local) tryCatchList(expr, classes, parentenv, handlers)
#> 5. └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]])
#> 6. └─value[[3L]](cond)
#> 7. └─arrow:::augment_io_error_msg(e, call, schema = x$.data$schema)
#> 8. └─rlang::abort(msg, call = call)
sessionInfo()
#> R version 4.2.2 (2022-10-31)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Debian GNU/Linux 11 (bullseye)
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.13.so
#>
#> locale:
#> [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
#> [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
#> [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] arrow_10.0.0 forcats_0.5.2 stringr_1.4.1 dplyr_1.0.10
#> [5] purrr_0.3.5 readr_2.1.3 tidyr_1.2.1 tibble_3.1.8
#> [9] ggplot2_3.4.0 tidyverse_1.3.2
#>
#> loaded via a namespace (and not attached):
#> [1] lubridate_1.9.0 assertthat_0.2.1 digest_0.6.30
#> [4] utf8_1.2.2 R6_2.5.1 cellranger_1.1.0
#> [7] backports_1.4.1 reprex_2.0.2 evaluate_0.17
#> [10] httr_1.4.4 highr_0.9 pillar_1.8.1
#> [13] rlang_1.0.6 googlesheets4_1.0.1 readxl_1.4.1
#> [16] R.utils_2.12.1 R.oo_1.25.0 rmarkdown_2.17
#> [19] styler_1.8.0 googledrive_2.0.0 bit_4.0.4
#> [22] munsell_0.5.0 broom_1.0.1 compiler_4.2.2
#> [25] modelr_0.1.9 xfun_0.34 pkgconfig_2.0.3
#> [28] htmltools_0.5.3 tidyselect_1.2.0 fansi_1.0.3
#> [31] crayon_1.5.2 tzdb_0.3.0 dbplyr_2.2.1
#> [34] withr_2.5.0 R.methodsS3_1.8.2 grid_4.2.2
#> [37] jsonlite_1.8.3 gtable_0.3.1 lifecycle_1.0.3
#> [40] DBI_1.1.3 magrittr_2.0.3 scales_1.2.1
#> [43] vroom_1.6.0 cli_3.4.1 stringi_1.7.8
#> [46] fs_1.5.2 xml2_1.3.3 ellipsis_0.3.2
#> [49] generics_0.1.3 vctrs_0.5.0 tools_4.2.2
#> [52] bit64_4.0.5 R.cache_0.16.0 glue_1.6.2
#> [55] hms_1.1.2 parallel_4.2.2 fastmap_1.1.0
#> [58] yaml_2.3.6 timechange_0.1.1 colorspace_2.0-3
#> [61] gargle_1.2.1 rvest_1.0.3 knitr_1.40
#> [64] haven_2.5.1
Created on 2022-11-07 with reprex v2.0.2
ifelse works here when all classes are correct (and not double()); if_else enforces this already, so we can use either.
data |>
mutate(x = sub(" -.*", "", x)) |>
mutate(
x = ifelse(x == "", NA_character_, x), # also if_else works
x2 = as.numeric(x)
) |>
collect()
# # A tibble: 40 x 3
# x y x2
# <chr> <dbl> <dbl>
# 1 4000 1 4000
# 2 6000 1.23 6000
# 3 NA 1.46 NA
# 4 8000 1.69 8000
# 5 4000 1.92 4000
# 6 6000 2.15 6000
# 7 NA 2.38 NA
# 8 8000 2.62 8000
# 9 4000 2.85 4000
# 10 6000 3.08 6000
# # ... with 30 more rows
Try using the read_csv instead of open_dataset
library(readr)
data <- read_csv("test_string.csv")
Related
Please have a look at the reprex at the end of the post.
I generate some lognormally distributed values and then I bin the distribution using a non-uniform bin (the grid is evenly spaced if I take its logarithm).
The point is not the maths, but the fact that, using annotation_logticks
( see
https://ggplot2.tidyverse.org/reference/annotation_logticks.html
) I cannot add the ticks to the plot.
Does anybody understand what goes wrong?
Thanks a lot!
library(tidyverse)
library(scales)
#>
#> Attaching package: 'scales'
#> The following object is masked from 'package:purrr':
#>
#> discard
#> The following object is masked from 'package:readr':
#>
#> col_factor
## auxiliary functions
scale_x_log10nice <- function(name=NULL,omag=seq(-20,20),...) {
breaks10 <- 10^omag
scale_x_log10(name,breaks=breaks10,
labels=scales::trans_format("log10", scales::math_format(10^.x)),...)
}
log_binning <- function(x_min,x_max,n_bin){
x_max <- x_max
m <- n_bin-1
r <- (x_max/x_min)^(1/m)
my_seq <- seq(0,m,by=1)
grid <- x_min*r^my_seq
}
##################################################à
set.seed(1234)
n_bins <- 10
df <- tibble(x=rlnorm(10e4, sdlog=2))
my_breaks2 <- log_binning(min(df$x),
max(df$x), n_bins)
gpl <- ggplot(df, aes(x=x )) +
theme_bw()+
geom_histogram(## binwidth=10e3,
colour="black", fill="blue"## , boundary=0
, breaks=my_breaks2
)+
scale_x_log10nice("x values")
gpl
gpl2 <- gpl+
annotation_logticks(sides="b", outside=T)
## where are the logticks?
gpl2
sessionInfo()
#> R version 4.2.2 (2022-10-31)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Debian GNU/Linux 11 (bullseye)
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
#>
#> locale:
#> [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
#> [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
#> [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] scales_1.2.1 forcats_0.5.2 stringr_1.5.0 dplyr_1.0.99.9000
#> [5] purrr_1.0.0 readr_2.1.3 tidyr_1.2.1 tibble_3.1.8
#> [9] ggplot2_3.4.0 tidyverse_1.3.2
#>
#> loaded via a namespace (and not attached):
#> [1] tidyselect_1.2.0 xfun_0.36 haven_2.5.1
#> [4] gargle_1.2.1 colorspace_2.0-3 vctrs_0.5.1
#> [7] generics_0.1.3 htmltools_0.5.4 yaml_2.3.6
#> [10] utf8_1.2.2 rlang_1.0.6 pillar_1.8.1
#> [13] glue_1.6.2 withr_2.5.0 DBI_1.1.3
#> [16] dbplyr_2.2.1 readxl_1.4.1 modelr_0.1.10
#> [19] lifecycle_1.0.3 munsell_0.5.0 gtable_0.3.1
#> [22] cellranger_1.1.0 rvest_1.0.3 evaluate_0.19
#> [25] labeling_0.4.2 knitr_1.41 tzdb_0.3.0
#> [28] fastmap_1.1.0 fansi_1.0.3 highr_0.10
#> [31] broom_1.0.2 backports_1.4.1 googlesheets4_1.0.1
#> [34] jsonlite_1.8.4 farver_2.1.1 fs_1.5.2
#> [37] hms_1.1.2 digest_0.6.31 stringi_1.7.8
#> [40] grid_4.2.2 cli_3.6.0 tools_4.2.2
#> [43] magrittr_2.0.3 crayon_1.5.2 pkgconfig_2.0.3
#> [46] ellipsis_0.3.2 xml2_1.3.3 reprex_2.0.2
#> [49] googledrive_2.0.0 lubridate_1.9.0 timechange_0.1.1
#> [52] assertthat_0.2.1 rmarkdown_2.19 httr_1.4.4
#> [55] R6_2.5.1 compiler_4.2.2
Created on 2023-01-17 with reprex v2.0.2
If you want to use outside = TRUE in annotation_logticks, you also need to turn clipping off.
From the docs for ?annotation_logticks
outside logical that controls whether to move the log ticks outside of the plot area. Default is off (FALSE). You will also need to use coord_cartesian(clip = "off")
gpl +
annotation_logticks(sides="b", outside = TRUE) +
coord_cartesian(clip = "off")
This question already has answers here:
Error: could not find function ... in R
(10 answers)
Closed 7 months ago.
I installed a R package named "GeoTcgaData", libraried it and wanted to use the function "fpkmToTPM_matrix" to convert my data. But this message came out:
Error in fpkmToTpm_matrix(J_Rseq) :
could not find function "fpkmToTpm_matrix"
Wondering where the problem is.
(The version of the package is 1.1.0)
You may need to install some of the dependencies separately via Bioconductor. If you install the packages using:
install.packages("BiocManager")
BiocManager::install(pkgs = c('DESeq2', 'impute', 'edgeR', 'cqn', 'topconfects', 'ChAMP', 'clusterProfiler',
'org.Hs.eg.db', 'minfi', 'IlluminaHumanMethylation450kanno.ilmn12.hg19',
'dearseq', 'NOISeq'))
You can then (hopefully) install and run GeoTcgaData as expected:
install.packages("GeoTcgaData", type = "source")
library(GeoTcgaData)
#> =============================================================
#> Hello, friend! welcome to use GeoTcgaData!
#> -------------------------------------------------------------
#> Version:1.1.0
#> =============================================================
lung_squ_count2 <- matrix(c(0.11,0.22,0.43,0.14,0.875,0.66,0.77,0.18,0.29),ncol=3)
rownames(lung_squ_count2) <- c("DISC1","TCOF1","SPPL3")
colnames(lung_squ_count2) <- c("sample1","sample2","sample3")
lung_squ_count2
#> sample1 sample2 sample3
#> DISC1 0.11 0.140 0.77
#> TCOF1 0.22 0.875 0.18
#> SPPL3 0.43 0.660 0.29
result <- fpkmToTpm_matrix(lung_squ_count2)
result
#> sample1 sample2 sample3
#> DISC1 144736.8 83582.09 620967.7
#> TCOF1 289473.7 522388.06 145161.3
#> SPPL3 565789.5 394029.85 233871.0
sessionInfo()
#> R version 4.1.3 (2022-03-10)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur/Monterey 10.16
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] GeoTcgaData_1.1.0
#>
#> loaded via a namespace (and not attached):
#> [1] Rcpp_1.0.9 plyr_1.8.7 pillar_1.8.0 compiler_4.1.3
#> [5] highr_0.9 R.methodsS3_1.8.2 R.utils_2.12.0 tools_4.1.3
#> [9] digest_0.6.29 evaluate_0.15 lifecycle_1.0.1 tibble_3.1.8
#> [13] R.cache_0.16.0 pkgconfig_2.0.3 rlang_1.0.4 reprex_2.0.1
#> [17] DBI_1.1.3 cli_3.3.0 rstudioapi_0.13 yaml_2.3.5
#> [21] xfun_0.31 fastmap_1.1.0 withr_2.5.0 styler_1.7.0
#> [25] stringr_1.4.0 dplyr_1.0.9 knitr_1.39 generics_0.1.3
#> [29] fs_1.5.2 vctrs_0.4.1 tidyselect_1.1.2 glue_1.6.2
#> [33] R6_2.5.1 fansi_1.0.3 rmarkdown_2.14 purrr_0.3.4
#> [37] magrittr_2.0.3 htmltools_0.5.3 splines_4.1.3 assertthat_0.2.1
#> [41] utf8_1.2.2 nor1mix_1.3-0 stringi_1.7.8 cqn_1.38.0
#> [45] R.oo_1.25.0
Created on 2022-08-08 by the reprex package (v2.0.1)
If installing the dependencies separately doesn't solve the issue, another potential alternative is to define the function yourself. From the source code:
fpkmToTpm <- function(fpkm)
{
exp(log(fpkm) - log(sum(fpkm)) + log(1e6))
}
fpkmToTpm_matrix <- function(fpkm_matrix) {
fpkm_matrix_new <- apply(fpkm_matrix, 2, fpkmToTpm)
}
fpkmToTpm_matrix(J_Rseq)
Does this work?
Also, please edit your question to include the output of the command sessionInfo()
I am applying WOE (weight of evidence) transformation for my features (using 'step_woe' from the 'embed' package) within the 'recipes' framework, but by default it takes the 0 value as reference and thus the WOE values are reversed.
I am trying to relevel the target to set "1" as reference but the results are the same (no change in the direction of woe values). Any idea how to get it right?
Here is an example, first I create example dataset with a target (0's and 1's) and one feature ('yes', 'no') in perfect relationship with each other. Then I apply step_woe transformation while setting the reference level either '0' or '1' to compare the results with no difference.
library(tidyverse)
library(recipes)
#>
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stringr':
#>
#> fixed
#> The following object is masked from 'package:stats':
#>
#> step
library(embed)
example_df <-
tibble(
target = rbinom(1000, 1, 0.5),
feature = ifelse(target == 1, "yes", "no")
) %>%
mutate_all(as.factor) %>%
print()
#> # A tibble: 1,000 x 2
#> target feature
#> <fct> <fct>
#> 1 0 no
#> 2 1 yes
#> 3 0 no
#> 4 0 no
#> 5 1 yes
#> 6 0 no
#> 7 1 yes
#> 8 1 yes
#> 9 0 no
#> 10 0 no
#> # … with 990 more rows
woe_recipe_0 <-
recipe(target ~ feature, data = example_df) %>%
step_relevel(target, ref_level = "0") %>%
embed::step_woe(all_nominal_predictors(), outcome = "target") %>%
prep(., retain = FALSE)
tidy(woe_recipe_0, number = 2)
#> # A tibble: 2 x 10
#> terms value n_tot n_0 n_1 p_0 p_1 woe outcome id
#> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
#> 1 feature no 493 493 0 1 0 20.0 target woe_nY7AB
#> 2 feature yes 507 0 507 0 1 -20.0 target woe_nY7AB
woe_recipe_1 <-
recipe(target ~ feature, data = example_df) %>%
step_relevel(target, ref_level = "1") %>%
embed::step_woe(all_nominal_predictors(), outcome = "target") %>%
prep(., retain = FALSE)
tidy(woe_recipe_1, number = 2)
#> # A tibble: 2 x 10
#> terms value n_tot n_0 n_1 p_0 p_1 woe outcome id
#> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
#> 1 feature no 493 493 0 1 0 20.0 target woe_Lt6pK
#> 2 feature yes 507 0 507 0 1 -20.0 target woe_Lt6pK
sessionInfo()
#> R version 3.5.1 (2018-07-02)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Red Hat Enterprise Linux
#>
#> Matrix products: default
#> BLAS: /opt/R/3.5.1/lib64/R/lib/libRblas.so
#> LAPACK: /opt/R/3.5.1/lib64/R/lib/libRlapack.so
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] embed_0.1.5 recipes_0.1.17 forcats_0.4.0 stringr_1.4.0
#> [5] dplyr_1.0.7 purrr_0.3.4 readr_1.3.1 tidyr_1.1.2
#> [9] tibble_3.0.4 ggplot2_3.3.5 tidyverse_1.3.0
#>
#> loaded via a namespace (and not attached):
#> [1] httr_1.4.1 jsonlite_1.6 splines_3.5.1
#> [4] prodlim_2019.11.13 modelr_0.1.5 RcppParallel_5.0.2
#> [7] assertthat_0.2.1 highr_0.8 cellranger_1.1.0
#> [10] yaml_2.2.0 ipred_0.9-12 pillar_1.6.2
#> [13] backports_1.2.1 lattice_0.20-35 glue_1.5.1
#> [16] reticulate_1.13 digest_0.6.27 rvest_0.3.5
#> [19] colorspace_2.0-0 htmltools_0.4.0 Matrix_1.2-14
#> [22] timeDate_3043.102 pkgconfig_2.0.3 broom_0.7.6
#> [25] haven_2.2.0 scales_1.1.0 whisker_0.4
#> [28] gower_0.2.1 lava_1.6.6 generics_0.1.0
#> [31] ellipsis_0.3.2 withr_2.4.1 keras_2.2.5.0
#> [34] nnet_7.3-12 cli_2.4.0 survival_2.42-3
#> [37] magrittr_2.0.1 crayon_1.4.1 readxl_1.3.1
#> [40] evaluate_0.14 fs_1.3.1 fansi_0.4.2
#> [43] MASS_7.3-51.4 xml2_1.2.2 class_7.3-14
#> [46] tools_3.5.1 hms_1.1.1 lifecycle_1.0.1
#> [49] munsell_0.5.0 reprex_0.3.0 compiler_3.5.1
#> [52] rlang_0.4.12 grid_3.5.1 rstudioapi_0.11
#> [55] base64enc_0.1-3 rmarkdown_1.18 gtable_0.3.0
#> [58] DBI_1.1.1 R6_2.5.0 tfruns_1.4
#> [61] lubridate_1.7.4 knitr_1.26 tensorflow_2.0.0
#> [64] uwot_0.1.5 utf8_1.2.1 zeallot_0.1.0
#> [67] stringi_1.4.3 Rcpp_1.0.7 vctrs_0.3.8
#> [70] rpart_4.1-15 dbplyr_2.1.1 tidyselect_1.1.1.9000
#> [73] xfun_0.11
Created on 2022-02-02 by the reprex package (v0.3.0)
This has been asked several times in different forms, but here is a very simple example. I have used this function several times in the past, but apparently it fails miserably in the simplest case.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(purrr)
bind_df_list <- function(ll){
res <- map_df(ll, bind_rows)
return(res)
}
ll<-list(seq(4), seq(4), seq(4))
dd<-bind_df_list(ll)
#> Error: Argument 1 must have names.
print(sessionInfo())
#> R version 4.0.4 (2021-02-15)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Debian GNU/Linux 10 (buster)
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.3.5.so
#>
#> locale:
#> [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
#> [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
#> [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] purrr_0.3.4 dplyr_1.0.4
#>
#> loaded via a namespace (and not attached):
#> [1] knitr_1.31 magrittr_2.0.1 tidyselect_1.1.0 R6_2.5.0
#> [5] rlang_0.4.10 stringr_1.4.0 styler_1.3.2 highr_0.8
#> [9] tools_4.0.4 xfun_0.21 DBI_1.1.1 htmltools_0.5.1.1
#> [13] ellipsis_0.3.1 assertthat_0.2.1 yaml_2.2.1 digest_0.6.27
#> [17] tibble_3.0.6 lifecycle_1.0.0 crayon_1.4.1 vctrs_0.3.6
#> [21] fs_1.5.0 glue_1.4.2 evaluate_0.14 rmarkdown_2.6
#> [25] reprex_1.0.0 stringi_1.5.3 compiler_4.0.4 pillar_1.4.7
#> [29] generics_0.1.0 backports_1.2.1 pkgconfig_2.0.3
Created on 2021-03-08 by the reprex package (v1.0.0)
Any idea about how to fix this?
Many thanks!
it is just a vector and ?bind_rows input should be a set of dataframes
... - Data frames to combine.
One option is to loop over the list, with map, convert to data.frame and append them by rows with _dfr
library(purrr)
map_dfr(ll, as.data.frame.list)
Or transpose and convert to data.frame
map_dfr(ll, ~ as.data.frame(t(.x)))
Or this can also be a named vector
map_dfr(ll, ~ setNames(.x, seq_along(.x)))
Or use rbind with do.call from base R as rbind have methods for matrix and data.frame
do.call(rbind, ll)
do.call(rbind.data.frame, ll)
For some reason stat_density_2d() does not seem to be working right for me when specifying geom = "polygon" and I am absolutely stumped. Here is my code...
library(sf)
library(tidyverse)
library(RANN2)
library(hexbin)
library(mapproj)
options(stringsAsFactors = FALSE)
raleigh_police <- rgdal::readOGR("https://opendata.arcgis.com/datasets/24c0b37fa9bb4e16ba8bcaa7e806c615_0.geojson", "OGRGeoJSON")
raleigh_police_sf <- raleigh_police %>%
st_as_sf()
raleigh_police_sf %>%
filter(crime_description == "Burglary/Residential") %>%
st_coordinates() %>%
as_tibble() %>%
ggplot() +
stat_density_2d(aes(X, Y, fill = stat(level)), geom = "polygon")
Here is my sessionInfo()...
R version 3.5.1 (2018-07-02)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS 10.14.1
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] bindrcpp_0.2.2 mapproj_1.2.6 maps_3.3.0 hexbin_1.27.2 RANN2_0.1 forcats_0.3.0 stringr_1.3.1 dplyr_0.7.8
[9] purrr_0.2.5 readr_1.1.1 tidyr_0.8.2 tibble_1.4.2 ggplot2_3.1.0 tidyverse_1.2.1 sf_0.7-1
loaded via a namespace (and not attached):
[1] Rcpp_1.0.0 lubridate_1.7.4 lattice_0.20-38 class_7.3-14 utf8_1.1.4 assertthat_0.2.0 rprojroot_1.3-2
[8] digest_0.6.18 R6_2.3.0 cellranger_1.1.0 plyr_1.8.4 backports_1.1.2 evaluate_0.12 e1071_1.7-0
[15] httr_1.3.1 blogdown_0.9 pillar_1.3.0 rlang_0.3.0.1 lazyeval_0.2.1 readxl_1.1.0 rstudioapi_0.8
[22] rmarkdown_1.10 labeling_0.3 rgdal_1.3-6 munsell_0.5.0 broom_0.5.0 compiler_3.5.1 modelr_0.1.2
[29] xfun_0.4 pkgconfig_2.0.2 htmltools_0.3.6 tidyselect_0.2.5 bookdown_0.7 codetools_0.2-15 fansi_0.4.0
[36] crayon_1.3.4 withr_2.1.2 MASS_7.3-51.1 grid_3.5.1 nlme_3.1-137 spData_0.2.9.4 jsonlite_1.5
[43] gtable_0.2.0 DBI_1.0.0 magrittr_1.5 units_0.6-1 scales_1.0.0 cli_1.0.1 stringi_1.2.4
[50] sp_1.3-1 xml2_1.2.0 tools_3.5.1 glue_1.3.0 hms_0.4.2 yaml_2.2.0 colorspace_1.3-2
[57] classInt_0.2-3 rvest_0.3.2 knitr_1.20 bindr_0.1.1 haven_1.1.2
I just get a blank plot with nothing on it. Completely stumped. What am I doing wrong here?
Update (2018-11-18)
It turns out that the primary issue here was options(stringsAsFactors = FALSE). If you comment that out and run the original code, everything actually works fine. I found this GitHub Issue which was the reason I tried that. Much more efficient code solutions are provided in the answers to this question and they also made sure not to use options(stringsAsFactors = FALSE).
Aside from downloading the file before reading and changing readOGR to read_sf, it works as-is for me save warnings for a couple NA points caused by empty geometries:
library(tidyverse)
library(sf)
#> Linking to GEOS 3.6.1, GDAL 2.1.3, PROJ 4.9.3
path <- "~/Downloads/raleigh.geojson"
download.file(
"https://opendata.arcgis.com/datasets/24c0b37fa9bb4e16ba8bcaa7e806c615_0.geojson",
path,
method = "curl"
)
raleigh_police <- sf::read_sf(path, "OGRGeoJSON")
raleigh_police %>%
filter(crime_description == "Burglary/Residential") %>%
st_coordinates() %>%
as_tibble() %>%
ggplot() +
stat_density_2d(aes(X, Y, fill = stat(level)), geom = "polygon")
#> Warning: Removed 5 rows containing non-finite values (stat_density2d).
The empty rows:
raleigh_police %>%
filter(crime_description == "Burglary/Residential",
st_is_empty(.))
#> Simple feature collection with 5 features and 21 fields (with 5 geometries empty)
#> geometry type: POINT
#> dimension: XY
#> bbox: xmin: NA ymin: NA xmax: NA ymax: NA
#> epsg (SRID): 4326
#> proj4string: +proj=longlat +datum=WGS84 +no_defs
#> # A tibble: 5 x 22
#> OBJECTID GlobalID case_number crime_category crime_code crime_descripti…
#> <int> <chr> <chr> <chr> <chr> <chr>
#> 1 205318 8057315… P14076062 BURGLARY/RESI… 30B Burglary/Reside…
#> 2 417488 70afb27… P15027702 BURGLARY/RESI… 30B Burglary/Reside…
#> 3 424718 bdf69fa… P18029113 BURGLARY/RESI… 30B Burglary/Reside…
#> 4 436550 711c05b… P18044139 BURGLARY/RESI… 30B Burglary/Reside…
#> 5 442091 9d7a008… P18051764 BURGLARY/RESI… 30B Burglary/Reside…
#> # … with 16 more variables: crime_type <chr>, reported_block_address <chr>,
#> # city_of_incident <chr>, city <chr>, district <chr>, reported_date <dttm>,
#> # reported_year <int>, reported_month <int>, reported_day <int>,
#> # reported_hour <int>, reported_dayofwk <chr>, latitude <dbl>,
#> # longitude <dbl>, agency <chr>, updated_date <dttm>, geometry <POINT [°]>
sessionInfo()
#> R version 3.5.1 (2018-07-02)
#> Platform: x86_64-apple-darwin15.6.0 (64-bit)
#> Running under: macOS 10.14.1
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] sf_0.7-1 forcats_0.3.0 stringr_1.3.1
#> [4] dplyr_0.7.99.9000 purrr_0.2.5 readr_1.2.0
#> [7] tidyr_0.8.2 tibble_1.4.99.9005 ggplot2_3.1.0
#> [10] tidyverse_1.2.1
#>
#> loaded via a namespace (and not attached):
#> [1] tidyselect_0.2.5 haven_1.1.2 lattice_0.20-35
#> [4] colorspace_1.3-2 htmltools_0.3.6 yaml_2.2.0
#> [7] rlang_0.3.0.1 e1071_1.7-0 pillar_1.3.0.9001
#> [10] glue_1.3.0 withr_2.1.2 DBI_1.0.0
#> [13] modelr_0.1.2 readxl_1.1.0 plyr_1.8.4
#> [16] munsell_0.5.0 gtable_0.2.0 cellranger_1.1.0
#> [19] rvest_0.3.2 evaluate_0.12 labeling_0.3
#> [22] knitr_1.20 class_7.3-14 broom_0.5.0
#> [25] Rcpp_0.12.19.3 scales_1.0.0 backports_1.1.2
#> [28] classInt_0.2-3 jsonlite_1.5 hms_0.4.2.9001
#> [31] digest_0.6.18 stringi_1.2.4 grid_3.5.1
#> [34] rprojroot_1.3-2 cli_1.0.1 tools_3.5.1
#> [37] magrittr_1.5 lazyeval_0.2.1 crayon_1.3.4
#> [40] pkgconfig_2.0.2 MASS_7.3-51 xml2_1.2.0
#> [43] spData_0.2.9.4 lubridate_1.7.4 assertthat_0.2.0
#> [46] rmarkdown_1.10 httr_1.3.1 R6_2.3.0
#> [49] units_0.6-1 nlme_3.1-137 compiler_3.5.1
As noted, this is just point-level data and the CSV they provide can be a fine substitute:
library(tidyverse)
rp_csv_url <- "https://opendata.arcgis.com/datasets/24c0b37fa9bb4e16ba8bcaa7e806c615_0.csv"
httr::GET(
url = rp_csv_url,
httr::write_disk(basename(rp_csv_url)), # won't overwrite if it exists unless explicitly told to so you get caching for free
httr::progress() # I suspect this is a big file so it's nice to see a progress bar
)
raleigh_police <- read_csv(basename(rp_csv_url))
mutate(
raleigh_police,
longitude = as.numeric(longitude), # they come in wonky, still
latitude = as.numeric(latitude) # they come in wonky, still
) -> raleigh_police
raleigh_police %>%
filter(crime_description == "Burglary/Residential") %>%
ggplot() +
stat_density_2d(
aes(longitude, latitude, fill = stat(level)),
color = "#2b2b2b", size=0.125, geom = "polygon"
) +
viridis::scale_fill_viridis(direction=-1, option="magma") +
hrbrthemes::theme_ipsum_rc()
If you'd like to turn level into something more meaningful:
h <- c(MASS::bandwidth.nrd(rp_br$longitude),
MASS::bandwidth.nrd(rp_br$latitude))
dens <- MASS::kde2d(
rp_br$longitude, rp_br$latitude, h = h, n = 100
)
breaks <- pretty(range(dens$z), 10)
zdf <- data.frame(expand.grid(x = dens$x, y = dens$y), z = as.vector(dens$z))
z <- tapply(zdf$z, zdf[c("x", "y")], identity)
cl <- grDevices::contourLines(
x = sort(unique(dens$x)), y = sort(unique(dens$y)), z = dens$z,
levels = breaks
)
sp::SpatialPolygons(
lapply(1:length(cl), function(idx) {
sp::Polygons(
srl = list(sp::Polygon(
matrix(c(cl[[idx]]$x, cl[[idx]]$y), nrow=length(cl[[idx]]$x), byrow=FALSE)
)),
ID = idx
)
})
) -> cont
sp::coordinates(rp_br) <- ~longitude+latitude
then:
data_frame(
ct = sapply(sp::over(cont, sp::geometry(rp_br), returnList = TRUE), length),
id = 1:length(ct),
lvl = sapply(cl, function(x) x$level)
) %>%
count(lvl, wt=ct) %>%
mutate(
pct = n/nrow(rp_br),
pct_lab = sprintf("%s of the points fall within this level", scales::percent(pct))
)
## # A tibble: 10 x 4
## lvl n pct pct_lab
## <dbl> <int> <dbl> <chr>
## 1 10. 7302 0.927 92.7% of the points fall within this level
## 2 20. 6243 0.792 79.2% of the points fall within this level
## 3 30. 4786 0.607 60.7% of the points fall within this level
## 4 40. 3204 0.407 40.7% of the points fall within this level
## 5 50. 1945 0.247 24.7% of the points fall within this level
## 6 60. 1277 0.162 16.2% of the points fall within this level
## 7 70. 793 0.101 10.1% of the points fall within this level
## 8 80. 474 0.0601 6.0% of the points fall within this level
## 9 90. 279 0.0354 3.5% of the points fall within this level
## 10 100. 44 0.00558 0.6% of the points fall within this level