download.file with wildcard matching in R - r

I'm trying to download all the files that match a pattern from a url directory in R using download.file, but I can't get it working for even a single file. The url is:
https://coast.noaa.gov/htdata/CMSP/AISDataHandler/2019/
and the pattern match is all files like: AIS_2019_*_18.zip
Here is what I've tried for a single file case:
# NOTE(review): this call fails because the wget recursive-mirroring flags
# (-r, -np, -L) conflict with saving a single URL to `destfile`, and
# --max-redirect=0 makes wget exit non-zero on any redirect; download.file
# then reports "'wget' call had nonzero exit status". For a direct file
# URL the default method with no `extra` flags is sufficient.
download.file('https://coast.noaa.gov/htdata/CMSP/AISDataHandler/2019/AIS_2019_04_18.zip',
destfile = "AIS_2019_04_18.zip",
method = "wget", extra = c("-r", "-np", "-L", "--max-redirect=0"))
but I always get 'wget' call had nonzero exit status
I've also tried setting `method = "internal"` and `mode = "w"`, but then I get the error `scheme not supported in url`.

Here's a way to generate all the links that you can then loop through them with a for loop.
library(glue)
library(stringr)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
# Setup: the first day of each month in 2019, used only to look up how
# many days that month has.
month_dates <- glue("2019-{str_pad(1:12, width = 2, pad = '0')}-01")
days_in_months <- days_in_month(as.Date(month_dates))
# Get appropriate number of days and months combinations.
# Zero-pad the month as well as the day: the server's files are named
# like AIS_2019_04_18.zip, so an unpadded month ("AIS_2019_1_01.zip")
# would build URLs that do not exist.
months <- str_pad(rep(1:12, days_in_months), width = 2, pad = "0")
days <- unlist(mapply(function(x) str_pad(1:x, width = 2, pad = "0"),
days_in_months))
base_url <- "https://coast.noaa.gov/htdata/CMSP/AISDataHandler/2019"
# Put everything together
all_files <- glue("{base_url}/AIS_2019_{months}_{days}.zip")
# See results
head(all_files)
#> https://coast.noaa.gov/htdata/CMSP/AISDataHandler/2019/AIS_2019_01_01.zip
#> https://coast.noaa.gov/htdata/CMSP/AISDataHandler/2019/AIS_2019_01_02.zip
#> https://coast.noaa.gov/htdata/CMSP/AISDataHandler/2019/AIS_2019_01_03.zip
#> https://coast.noaa.gov/htdata/CMSP/AISDataHandler/2019/AIS_2019_01_04.zip
#> https://coast.noaa.gov/htdata/CMSP/AISDataHandler/2019/AIS_2019_01_05.zip
#> https://coast.noaa.gov/htdata/CMSP/AISDataHandler/2019/AIS_2019_01_06.zip
# Check number of the days in a year is correct
length(all_files)
#> [1] 365
Created on 2021-08-04 by the reprex package (v2.0.0)
Once you have those created, you can do something like:
# Untested
# Download each file next to the script. Notes:
# - `extra` is only honoured when method = "wget" or "curl"; with the
#   default method it is silently ignored, and the recursive wget flags
#   are not needed for direct file URLs anyway, so they are dropped.
# - mode = "wb" is required so zip files are written as binary on Windows.
for (file in all_files) {
download.file(file,
destfile = basename(file),
mode = "wb")
}

Related

Reading PDF portfolio in R

Is it possible to read/convert PDF portfolios in R?
I usually use pdftools, however, I get an error:
library(pdftools)
#> Using poppler version 0.73.0
# URL of an FDA 510(k) PDF portfolio (a PDF with attached sub-documents)
link <- c("http://www.accessdata.fda.gov/cdrh_docs/pdf19/K190072.pdf")
# Renders only the portfolio's front page, not the attached documents
pdftools::pdf_convert(link, dpi = 600)
#> Converting page 1 to K190072_1.png...
#> PDF error: Non conformant codestream TPsot==TNsot.<0a>
#> PDF error: Non conformant codestream TPsot==TNsot.<0a>
#> PDF error: Non conformant codestream TPsot==TNsot.<0a>
#> PDF error: Non conformant codestream TPsot==TNsot.<0a>
#> done!
#> [1] "K190072_1.png"
Created on 2021-05-06 by the reprex package (v1.0.0)
The K190072_1.png I finally get is only the image of the portfolio front page.
I am interested in the document K190072.510kSummary.Final_Sent001.pdf contained in this PDF portfolio.
I found a way for Python (Reading a PDF Portfolio in Python?) but I would really like to do that in R.
Thank you for your help.
There seems to be an issue with pdf_convert handling one-page raw pdf data (it wants to use basename(pdf) under these conditions), so I have edited that function so that it also works with the second attached pdf file.
If you only need the first file then you could run this with the original pdf_convert function, but it will give an error with the second file.
If you are interested in rendering raster graphics from the attached files this worked for me:
library(pdftools)
#> Using poppler version 21.02.0
link <- c("http://www.accessdata.fda.gov/cdrh_docs/pdf19/K190072.pdf")
# Patched copy of pdftools::pdf_convert that also accepts one-page *raw*
# PDF data: the released version derives default output filenames via
# basename(pdf), which only makes sense when `pdf` is a file path.
#
# Arguments mirror pdftools::pdf_convert:
#   pdf       path/URL to a PDF, or a raw vector of PDF data
#   format    output image format (must be supported by poppler)
#   pages     one-indexed page numbers; NULL means all pages
#   filenames sprintf() template or one filename per page
#   dpi       render resolution
#   antialias TRUE/FALSE, or "draw"/"text" to enable selectively
#   opw, upw  owner/user password
#   verbose   print progress messages
# Returns the character vector of files written.
pdf_convert <- function (pdf, format = "png", pages = NULL, filenames = NULL,
    dpi = 72, antialias = TRUE, opw = "", upw = "", verbose = TRUE) {
  config <- poppler_config()
  if (!config$can_render || !length(config$supported_image_formats))
    stop("Your version of libpoppler does not support rendering")
  # Reuse `config` instead of calling poppler_config() a second time.
  format <- match.arg(format, config$supported_image_formats)
  if (is.null(pages))
    pages <- seq_len(pdf_info(pdf, opw = opw, upw = upw)$pages)
  if (!is.numeric(pages) || !length(pages))
    stop("Argument 'pages' must be a one-indexed vector of page numbers")
  # Only derive default filenames from the input path when we actually
  # have a path: raw PDF data has no basename (this is the fix).
  # `&&` (scalar, short-circuit) is the correct operator inside if().
  if (length(filenames) < 2 && !is.raw(pdf)) {
    input <- sub(".pdf", "", basename(pdf), fixed = TRUE)
    filenames <- if (length(filenames)) {
      sprintf(filenames, pages, format)
    } else {
      sprintf("%s_%d.%s", input, pages, format)
    }
  }
  if (length(filenames) != length(pages))
    stop("Length of 'filenames' must be one or equal to 'pages'")
  antialiasing <- isTRUE(antialias) || isTRUE(antialias == "draw")
  text_antialiasing <- isTRUE(antialias) || isTRUE(antialias == "text")
  # Delegate to pdftools internals; loadfile() handles both paths and raw.
  pdftools:::poppler_convert(pdftools:::loadfile(pdf), format, pages, filenames,
    dpi, opw, upw, antialiasing, text_antialiasing, verbose)
}
# Render every attachment of the portfolio to PNGs, one per page, named
# after the attachment (e.g. "K190072.IFU.FINAL_Sent001-1.png").
lapply(pdf_attachments(link), function(att) {
  page_count <- length(pdf_data(att$data))
  out_names <- paste0(tools::file_path_sans_ext(att$name), "-",
                      seq_len(page_count), ".png")
  pdf_convert(att$data, filenames = out_names)
})
#> Converting page 1 to K190072.510kSummary.Final_Sent001-1.png... done!
#> Converting page 2 to K190072.510kSummary.Final_Sent001-2.png... done!
#> Converting page 3 to K190072.510kSummary.Final_Sent001-3.png... done!
#> Converting page 4 to K190072.510kSummary.Final_Sent001-4.png... done!
#> Converting page 5 to K190072.510kSummary.Final_Sent001-5.png... done!
#> Converting page 1 to K190072.IFU.FINAL_Sent001-1.png... done!
#> Converting page 1 to K190072.Letter.SE.FINAL_Sent001-1.png... done!
#> Converting page 2 to K190072.Letter.SE.FINAL_Sent001-2.png... done!
#> [[1]]
#> [1] "K190072.510kSummary.Final_Sent001-1.png"
#> [2] "K190072.510kSummary.Final_Sent001-2.png"
#> [3] "K190072.510kSummary.Final_Sent001-3.png"
#> [4] "K190072.510kSummary.Final_Sent001-4.png"
#> [5] "K190072.510kSummary.Final_Sent001-5.png"
#>
#> [[2]]
#> [1] "K190072.IFU.FINAL_Sent001-1.png"
#>
#> [[3]]
#> [1] "K190072.Letter.SE.FINAL_Sent001-1.png"
#> [2] "K190072.Letter.SE.FINAL_Sent001-2.png"
Created on 2021-05-05 by the reprex package (v2.0.0)

Error despite purrr's 'otherwise' - Why is purrr/possibly's 'otherwise' not triggered?

I am scraping content from websites. For this I iterate over links. If an error occurs, purrr's possibly adverb should keep the process going, and place a "missing" (or "NA_character") as a result.
The code below works as intended when the site linked to is not existing, i.e. the output is "missing";
However, if the site linked to exists, but the element which I am trying to extract from the site does not exist, the function throws an error despite having defined a value for 'otherwise'.
To me this is surprising, since the documentation states that
' possibly : wrapped function uses a default value ( otherwise ) whenever an error occurs.'
Any idea why this is happening? I understand that i could modify the function accordingly (e.g. check for the length of the returned object). But I do not understand why the 'otherwise' value was not used.
library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.4
#> Warning: package 'tidyr' was built under R version 4.0.4
#> Warning: package 'dplyr' was built under R version 4.0.4
library(rvest)
#> Warning: package 'rvest' was built under R version 4.0.4
#>
#> Attaching package: 'rvest'
#> The following object is masked from 'package:readr':
#>
#> guess_encoding
# possibly with wrong links when scraping site ----------------------------
#see https://github.com/tidyverse/purrr/issues/409
# Three scraper test cases: success, hard failure, and "soft" failure.
sample_data <- tibble::tibble(
link = c(
# link ok, selected item exists -> scrape returns one link
"https://www.parlament.gv.at/PAKT/VHG/XXVII/NRSITZ/NRSITZ_00068/index.shtml#tab-Sten.Protokoll",
# link not ok (unreachable host) -> read_html() errors, possibly() catches it
"https://www.wrong-url.foobar",
# link ok, but selected item does not exist on the page -> zero-length result
"https://www.parlament.gv.at/PAKT/VHG/XXVII/NRSITZ/NRSITZ_00094/index.shtml#tab-Sten.Protokoll"
)
)
# Scrape one session-overview page and return the link to its
# stenographic record.
#
# Returns NA_character_ when the page loads but contains no matching
# href: possibly() only intercepts *errors*, and a zero-length "success"
# later makes map_chr() fail with "Result must be a single string".
fn_get_link_to_records <- function(link_to_overview_sessions) {
  print(link_to_overview_sessions)
  result <- link_to_overview_sessions %>%
    rvest::read_html() %>%
    rvest::html_elements("a") %>%
    rvest::html_attr("href") %>%
    enframe(name = NULL,
            value = "link_to_text") %>%
    filter(str_detect(link_to_text, regex("\\/NRSITZ_\\d+\\/fnameorig_\\d+\\.html$"))) %>%
    # The matched hrefs already start with "/", so no slash after the
    # host name — otherwise the result contains ".gv.at//PAKT/...".
    mutate(link_to_text = glue::glue("https://www.parlament.gv.at{link_to_text}")) %>%
    pull()
  # as.character() drops the glue class so map_chr() accepts the value.
  if (length(result) == 0) NA_character_ else as.character(result)
}
# NOTE(review): possibly() does not fire for the third link — the scrape
# *succeeds* and returns a zero-length vector; the error below is raised
# afterwards by map_chr(), which requires each result to be exactly one
# string.
sample_data %>%
mutate(link_to_text=map_chr(link,
possibly(fn_get_link_to_records,
otherwise=NA_character_)))
#> [1] "https://www.parlament.gv.at/PAKT/VHG/XXVII/NRSITZ/NRSITZ_00068/index.shtml#tab-Sten.Protokoll"
#> [1] "https://www.wrong-url.foobar"
#> [1] "https://www.parlament.gv.at/PAKT/VHG/XXVII/NRSITZ/NRSITZ_00094/index.shtml#tab-Sten.Protokoll"
#> Error: Problem with `mutate()` input `link_to_text`.
#> x Result 3 must be a single string, not a vector of class `glue/character` and of length 0
#> i Input `link_to_text` is `map_chr(link, possibly(fn_get_link_to_records, otherwise = NA_character_))`.
# Same failure with a different `otherwise`: the default value is never
# used because no error condition is ever signalled.
sample_data %>%
mutate(link_to_text=map_chr(link,
possibly(fn_get_link_to_records,
otherwise="missing")))
#> [1] "https://www.parlament.gv.at/PAKT/VHG/XXVII/NRSITZ/NRSITZ_00068/index.shtml#tab-Sten.Protokoll"
#> [1] "https://www.wrong-url.foobar"
#> [1] "https://www.parlament.gv.at/PAKT/VHG/XXVII/NRSITZ/NRSITZ_00094/index.shtml#tab-Sten.Protokoll"
#> Error: Problem with `mutate()` input `link_to_text`.
#> x Result 3 must be a single string, not a vector of class `glue/character` and of length 0
#> i Input `link_to_text` is `map_chr(link, possibly(fn_get_link_to_records, otherwise = "missing"))`.
Created on 2021-03-28 by the reprex package (v1.0.0)
UPDATE: I added the output below to make the unexpected result (last chunk) clearer.
# Rows 1-2 work: the dead URL raises an error inside read_html(), which
# possibly() converts to "missing".
sample_data[1:2,] %>%
mutate(link_to_text=map_chr(link,
possibly(fn_get_link_to_records,
otherwise="missing")))
#> [1] "https://www.parlament.gv.at/PAKT/VHG/XXVII/NRSITZ/NRSITZ_00068/index.shtml#tab-Sten.Protokoll"
#> [1] "https://www.wrong-url.foobar"
#> # A tibble: 2 x 2
#> link link_to_text
#> <chr> <chr>
#> 1 https://www.parlament.gv.at/PAKT/VHG~ https://www.parlament.gv.at//PAKT/VHG/X~
#> 2 https://www.wrong-url.foobar missing
# Row 3 fails: no error occurs, but the zero-length result cannot be
# coerced by map_chr() to a single string.
sample_data[3, ] %>%
mutate(link_to_text=map_chr(link,
possibly(fn_get_link_to_records,
otherwise="missing")))
#> [1] "https://www.parlament.gv.at/PAKT/VHG/XXVII/NRSITZ/NRSITZ_00094/index.shtml#tab-Sten.Protokoll"
#> Error: Problem with `mutate()` input `link_to_text`.
#> x Result 1 must be a single string, not a vector of class `glue/character` and of length 0
#> i Input `link_to_text` is `map_chr(link, possibly(fn_get_link_to_records, otherwise = "missing"))`.
Created on 2021-03-29 by the reprex package (v1.0.0)
The error is coming from map_chr, but you have possibly wrapped around the fn_get_link_to_records function. If you run fn_get_link_to_records(sample_data$link[3]) you'll see that the URL gets printed, nothing is returned, and no error is generated. However, map_chr cannot coerce this zero-length output to a single character value, hence the error. If you use map instead of map_chr you'll see it works.
sample_data[3,] %>%
mutate(link_to_text= map(link, fn_get_link_to_records))
#[1] #"https://www.parlament.gv.at/PAKT/VHG/XXVII/NRSITZ/NRSITZ_00094/index.shtml#tab-Sten.Protokoll"
# A tibble: 1 x 2
# link link_to_text
# <chr> <list>
#1 https://www.parlament.gv.at/PAKT/VHG/XXVII/NRSITZ/NRSITZ_00094/index.shtml#tab-Sten.Pro… <glue [0]>
but link_to_text is empty. The solution as you already know is check length of output value and return NA or generate an error inside fn_get_link_to_records functions for such cases which will be handled using possibly.

cronR add_cron doesn't execute script at all Mac OS High Sierra

I'm trying to create a cronjob using cronR. When I do it without the addin, my cronjob never starts.
library(cronR)
# Path to the R script to schedule
script <- "~/Documents/Job/MammyClub/Ostatki/Rscripts/XML/AnitaXML.R"
# Build the Rscript command line (cronR also appends a log redirect)
cmd <- cron_rscript(script)
# NOTE(review): at = "17:46" combined with frequency = "minutely" looks
# contradictory — the generated entry below ("46 17 * * *") actually runs
# once per day at 17:46; confirm which schedule was intended.
cron_add(command = cmd, at = "17:46", frequency = "minutely", id = "test_job1")
#> Adding cronjob:
#> ---------------
#>
#> ## cronR job
#> ## id: test_job1
#> ## tags:
#> ## desc:
#> 46 17 * * * /Library/Frameworks/R.framework/Resources/bin/Rscript '~/Documents/Job/MammyClub/Ostatki/Rscripts/XML/AnitaXML.R' >> '~/Documents/Job/MammyClub/Ostatki/Rscripts/XML/AnitaXML.log' 2>&1
Cron is running on the machine
Macintosh:~ aleksandr$ sudo cron start
Password:
cron: cron already running, pid: 240
Macintosh:~ aleksandr$
When I trying to create cronjob using addin the error occurs
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
Attaching package: ‘gmailr’
The following object is masked from ‘package:dplyr’:
id
The following object is masked from ‘package:utils’:
history
The following objects are masked from ‘package:base’:
body, date, labels, message
Auto-refreshing stale OAuth token.
[1] "price-2021-01-12.xlsx"
Execution halted
The script itself runs well in RStudio and via Terminal too
# Cron-driven script: fetch the latest supplier stock spreadsheet from
# Gmail, convert it to XML, and upload it to the site's FTP server.
options(java.parameters = "-Xmx1024m")
library(XML)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(gmailr)
#>
#> Attaching package: 'gmailr'
#> The following object is masked from 'package:dplyr':
#>
#> id
#> The following object is masked from 'package:utils':
#>
#> history
#> The following objects are masked from 'package:base':
#>
#> body, date, labels, message
gm_auth_configure()
gm_auth()
# find the specific email
thread <- gm_threads(search = "magbabyopt", # search by supplier
label_ids = "Label_8632852764550356173", # the "stock leftovers" folder
num_results = 1) # the most recent email
thread_id <- gm_id(thread) # id of the email we will work with
my_message <- gm_message(thread_id) # that email's data
att_ids <- gm_attachments(my_message) # data on all of the email's attachments
att_ids$filename
#> [1] "price-2021-01-12.xlsx"
# if there are several attachments, a selection pattern must be written here
my_attach <- gm_attachment(att_ids$id, thread_id)
# save the attachment; with several attachments, use a pattern as above for the path (filename)
gm_save_attachment(my_attach, glue::glue("~/Downloads/(unknown)", filename = att_ids$filename))
ostatki <- XLConnect::loadWorkbook(filename = glue::glue("~/Downloads/(unknown)",
filename = att_ids$filename))%>%
XLConnect::readWorksheet(., sheet = 1)%>%
select(., article = `Артикул`, quantity = `ОстатокНаСкладе`,
cost_price = `Цена3Значение`, price = `Цена1Значение`)%>%
mutate(., article = trimws(article, "both"), quantity = as.numeric(quantity))%>%
filter(quantity > 3)%>%
mutate(., across(everything(), as.character))
# Build the XML
# new xml document
ostatki_xml <- newXMLDoc()
# items (table data) node
items_node <- newXMLNode("items", doc = ostatki_xml)
# rows (names/values) node
item_data <- apply(ostatki, 1, function(x){
z <- newXMLNode("item") # one <item> node per data row
addChildren(z, lapply(names(x), function(y) newXMLNode(y, x[y])))
})
xmlParent(item_data) <- items_node # attach the row nodes to <items>
saveXML(ostatki_xml, file = "~/Downloads/Magbaby.xml")
#> [1] "/Users/aleksandr/Downloads/Magbaby.xml"
RCurl::ftpUpload(to = "ftp://mammyclub.com/www/www.mammyclub.com/web/uploads/xmls/Magbaby.xml",
what = "~/Downloads/Magbaby.xml", userpwd = "secret_key))")
#> OK
#> 0
# delete the temporary files
unlink(c(glue::glue("~/Downloads/(unknown)", filename = att_ids$filename),"~/Downloads/Magbaby.xml"))
#rm(list = ls())
What is the problem?
The .xlsx file with the data that the script works with has Cyrillic encoding.
I have to add language settings in crontab using crontab -e in Terminal, I think. But what settings?
The problem was precisely to write right locale for Cyrillic encoding in crontab
LANG=ru_RU.UTF-8
LC_ALL=ru_RU.UTF-8

Unable to plot heat map

library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Sample data: 7 apartments x 3 months (September-November). `Days` holds
# the number of days recorded for that apartment/month; NA marks a
# missing reading.
Apartment_no <- c("1-SV","1-SV","1-SV","1-SH","1-SH","1-SH","1-1V","1-1V","1-1V","1-1H","1-1H","1-1H","3-SV","3-SV","3-SV","3-1V","3-1V","3-1V","3-1H","3-1H","3-1H")
month <- c("September","October","November","September","October","November","September","October","November","September","October","November","September","October","November","September","October","November","September","October","November")
Days <- c(NA,19,28,2,19,28,2,19,NA,2,19,28,25,31,28,12,NA,24,8,26,19)
Heat_clean <- data.frame(Apartment_no,month,Days)
This is just a sample dataset. In actual, I have around 163 apartment_no and all the months of data. I wish to create an interactive heatmap for it since the data is quite big. I wish to use a special color code i.e whenever the Days==NA, color=Red,1<=Days<=5,color=Blue,6<=Days<=15,color=Orange,16<=Days<=25,color=Pink, 25<=Days<=31,color=Green. I have used the following code but it is not working for me & infact giving me an error message as "`x' must be a numeric matrix". My code is as follows:
> mypallete <- colorRampPalette(Days=na.color,col="Red", 1<=Days<=5,col="Blue", 6<=Days<=15,col="Orange", 16<=Days<=25,col="Pink", 25<=Days<=31,color=Green)
> heatmap.2(as.matrix(Heat_clean),Rowv = F,Colv = F,main = "Heatmeters data",col = mypallete,dendrogram = "none",density.info = "none",trace = "none")
I first defined my own pallete & then implemented it in my code. I wish to get something that can be seen in figure, though with my personalised color coding. Maybe later I can insert the code in plot_ly to get it interactive.
Your ranges are not exclusive (Days <= 25 and 25 <= Days) but that is easily fixed...
I don't think the structure of your data matches what heatmap.2 is expecting, and that is what is giving you the error, not the color map.
Here is one brute-force way to generate the color palette... (note I changed the spelling of mypalette)
# One colour per observation. Assign from the widest range downward so
# narrower ranges overwrite, then paint NAs last.
# NOTE: the comparisons must be wrapped in which() — `Days` contains NA,
# and a logical subscript containing NA in an assignment is an error
# ("NAs are not allowed in subscripted assignments"); which() drops NAs.
mypalette <- rep("Green", length(Days))
mypalette[which(Days <= 25)] <- "Pink"
mypalette[which(Days <= 15)] <- "Orange"
mypalette[which(Days <= 5)] <- "Blue"
mypalette[is.na(Days)] <- "Red"

Failed two methods to subset dataset with R, requesting assistance

I am attempting to make a subset of some data in R (open source statistics scripting language). I attempt two methods, but I am unsuccessful with both. One returns a table with no data, the other returns a table of all "NA" cells, but of the apparently correct dimensions.
I laid out the code pretty clearly commented--
First, I create the list of zip codes I'll use to subset the data. The list of zip codes is from a dataset I'll be using.
The list of zip codes is called "zipCodesOfData"
Next, I download the Crime Data I'll be subsetting. I basically just subset it into the data set that I need.
The last part, section three, shows that I try both %in% and the filter method to filter the Crime Data against the zip code data.
Unfortunately, neither method works. I was hoping someone might be able to point out my mistakes or recommend a different subsetting method for the third section.
(As an aside, in section two, I attempt to turn the list into a dataframe, but it does not work. I'm curious as to why, if anyone can shed light onto this for me.)
Thanks for your time & assistance!
####
#### Section zero: references and dependencies
####
# r's "choroplethr" library creator's blog for reference:
# http://www.arilamstein.com/blog/2015/06/25/learn-to-map-census-data-in-r/
# http://stackoverflow.com/questions/30787877/making-a-zip-code-choropleth-in-r-using-ggplot2-and-ggmap
#
# library(choroplethr)
# library(choroplethrMaps)
# library(ggplot2)
# # use the devtools package from CRAN to install choroplethrZip from github
# # install.packages("devtools")
# library(devtools)
# install_github('arilamstein/choroplethrZip')
# library(choroplethrZip)
# library(data.table)
#
####
#### Section one: the data set providing the zipcode we'll use to subset the crime set
####
# Load the Austin dataset that supplies the zip codes of interest.
austin2014_data_raw <- fread('https://data.austintexas.gov/resource/hcnj-rei3.csv')
names(austin2014_data_raw)
nrow(austin2014_data_raw)
## clean up: make any blank cells in column ZipCode say "NA" instead -> source: http://stackoverflow.com/questions/12763890/exclude-blank-and-na-in-r
austin2014_data_raw[austin2014_data_raw$ZipCode==""] <- NA
# keep only rows that do not have "NA"
austin2014_data <- na.omit(austin2014_data_raw)
nrow(austin2014_data) # now there's one less row.
# selecting the first column, which is ZipCode
# NOTE(review): `[, 1]` here returns a one-column *table*, not an atomic
# vector — this is the root cause of the failed subsets below. Extract
# the column instead, e.g. austin2014_data$ZipCode or austin2014_data[[1]].
zipCodesOfData <- austin2014_data[,1]
View(zipCodesOfData)
# Now we have the zipcodes we need: zipCodesOfData
####
#### Section two: Crime data
####
# Crime by zipcode: https://data.austintexas.gov/dataset/Annual-Crime-2014/7g8v-xxja
# (visualized: https://data.austintexas.gov/dataset/Annual-Crime-2014/8mst-ed5t )
# https://data.austintexas.gov/resource/<insertResourceNameHere>.csv w/ resource "7g8v-xxja"
austinCrime2014_data_raw <- fread('https://data.austintexas.gov/resource/7g8v-xxja.csv')
View(austinCrime2014_data_raw)
nrow(austinCrime2014_data_raw)
# First, let's remove the data we don't need
names(austinCrime2014_data_raw)
columnSelection_Crime <- c("GO Location Zip", "GO Highest Offense Desc", "Highest NIBRS/UCR Offense Description")
austinCrime2014_data_selected_columns <- subset(austinCrime2014_data_raw, select=columnSelection_Crime)
names(austinCrime2014_data_selected_columns)
nrow(austinCrime2014_data_selected_columns)
####
#### Section Three: The problem: I am unable to make subsets with the two following methods.
####
# Neither of these methods work:
# NOTE(review): both attempts fail the same way — `x %in% zipCodesOfData`
# tests x against a data-frame-like object (treated as a list of columns),
# so nothing matches, which is why both results come back empty.
# Attempt 1:
austinCrime2014_data_selected_columns <- austinCrime2014_data_selected_columns[austinCrime2014_data_selected_columns$`GO Location Zip` %in% zipCodesOfData , ]
View(austinCrime2014_data_selected_columns) # No data in the table
# Attempt 2:
# This initially told me an error:
# Then, I installed dplyr and the error went away.
library(dplyr)
# However, it still doesn't create anything-- just an empty set w/ headers
austinCrime2014_data_selected_zips <- filter(austinCrime2014_data_selected_columns, `GO Location Zip` %in% zipCodesOfData)
View(austinCrime2014_data_selected_zips)
I edited out this section, after realizing it was unnecessary.
####
#### Bad section
####
nrow(austinCrime2014_data_selected_columns)
# Then, let's keep only the zipcodes we need
# doesnt work: austinCrime2014_data_selected_columns_df <- data.frame(austinCrime2014_data_selected_columns)
# typeof(austinCrime2014_data_selected_columns_df)
# NOTE(review): rbind-ing the table's columns and transposing coerces
# everything to a single character matrix; it does not help with
# subsetting (the author later edited this section out).
austinCrime<-do.call("rbind", austinCrime2014_data_selected_columns)
austinCrime_needsTranspose <-as.data.frame(austinCrime)
austinCrime <- t(austinCrime_needsTranspose)
typeof(austinCrime)
View(austinCrime)
names(austinCrime)
####
#### Bad section
####
I think readr and dplyr can solve your problem. It's simple:
library(readr)
library(dplyr)
### SECTION 1
# Import data; na = '' turns empty cells into NA already at read time
austin2014_data_raw <- read_csv('https://data.austintexas.gov/resource/hcnj-rei3.csv', na = '')
glimpse(austin2014_data_raw)
nrow(austin2014_data_raw)
# Remove NAs
austin2014_data <- na.omit(austin2014_data_raw)
nrow(austin2014_data) # now there's one less row.
# Get zip codes as an atomic vector (not a one-column data frame), so
# %in% below compares element-wise as intended
zipCodesOfData <- austin2014_data$`Zip Code`
### SECTION 2
# Import data
austinCrime2014_data_raw <- read_csv('https://data.austintexas.gov/resource/7g8v-xxja.csv', na = '')
glimpse(austinCrime2014_data_raw)
nrow(austinCrime2014_data_raw)
# Select and rename required columns.
# all_of() replaces the superseded one_of(): it errors loudly (instead of
# just warning) if any requested column is missing.
columnSelection_Crime <- c("GO Location Zip", "GO Highest Offense Desc", "Highest NIBRS/UCR Offense Description")
austinCrime_df <- select(austinCrime2014_data_raw, all_of(columnSelection_Crime))
names(austinCrime_df) <- c("zipcode", "highestOffenseDesc", "NIBRS_OffenseDesc")
glimpse(austinCrime_df)
nrow(austinCrime_df)
### SECTION 3
# Filter by zipcode
austinCrime2014_data_selected_zips <- filter(austinCrime_df, zipcode %in% zipCodesOfData)
glimpse(austinCrime2014_data_selected_zips)
nrow(austinCrime2014_data_selected_zips)
Here I used read_csv() from the readr package to import data, and the subset methods select() and filter() from the dplyr package to get the required columns and rows.
I'm not sure why you're do.calling and transposing your data. You can just use something like dplyr's semi_join to get only the zipcodes you want:
library(data.table)
library(dplyr)
#> -------------------------------------------------------------------------
#> data.table + dplyr code now lives in dtplyr.
#> Please library(dtplyr)!
#> -------------------------------------------------------------------------
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:data.table':
#>
#> between, first, last
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Build the zip-code lookup: blank zips -> NA, drop incomplete rows, and
# keep only the `Zip Code` column (a one-column table, which is exactly
# what semi_join() expects as its right-hand side below).
zipCodesOfData <- fread('https://data.austintexas.gov/resource/hcnj-rei3.csv') %>%
mutate(`Zip Code` = ifelse(`Zip Code` == "", NA, `Zip Code`)) %>%
na.omit() %>%
select(`Zip Code`)
# semi_join() is a filtering join: it keeps crime rows whose zip appears
# in zipCodesOfData without adding any columns — no do.call/t() needed.
austinCrime2014_data_raw <- fread('https://data.austintexas.gov/resource/7g8v-xxja.csv') %>%
select(`GO Location Zip`, `GO Highest Offense Desc`, `Highest NIBRS/UCR Offense Description`) %>%
semi_join(zipCodesOfData, by = c("GO Location Zip" = "Zip Code")) %>%
rename(zipcode = `GO Location Zip`,
highestOffenseDesc = `GO Highest Offense Desc`,
NIBRS_OffenseDesc = `Highest NIBRS/UCR Offense Description`)

Resources