Read csv file from S3 into spark in R

Read csv file from S3 into spark in R - r

I have the code below to read a CSV file from S3 into Spark:
test_data <- spark_read_csv(
sc,
name = "Invites",
memory = FALSE,
path = "s3://xxxx/customer/Sample.csv")
csvcharobj <- rawToChar(test_data)
con <- textConnection(csvcharobj)
data <- read.csv(file = con)
But the code fails with the error below:
> csvcharobj <- rawToChar(test_data)
Error in rawToChar(test_data) : argument 'x' must be a raw vector

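I changed the code as shown below and it worked. spark_read_csv() returns a reference to a Spark table rather than a raw vector, which is why rawToChar() rejected it; converting the result to a data.table works instead: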
test_data <- spark_read_csv(
sc,
name = "Invites",
memory = FALSE,
path = "s3://xxxx/customer/Sample.csv")
test <- as.data.table(test_data)
cols_to_mask <- c("EmailAddress")
# Replace every value in `x` with a hash so the original text cannot be read.
#
# x:    vector of values to mask (here, the "EmailAddress" column).
# algo: hashing algorithm name forwarded to digest(); defaults to "crc32".
#
# Returns a character vector with one entry per element of `x`. Empty strings
# and NA are mapped to "" instead of being hashed.
anonymize <- function(x, algo = "crc32") {
  # vapply() pins the result to character even when `x` is empty, where
  # sapply() would hand back an empty list instead.
  vapply(
    x,
    function(y) {
      # Test NA first so the scalar `||` never has to evaluate `NA == ""`.
      if (is.na(y) || y == "") "" else digest(y, algo = algo)
    },
    character(1)
  )
}
setDT(test)
test[, (cols_to_mask) := lapply(.SD, anonymize), .SDcols = cols_to_mask]
print(test)

Related

How do I use write.csv in a for loop to use the original file name and a suffix to save my results to file?

I am using a loop to run through a folder of files and extract data points. The main part of my code works, but I am having issues getting it to save. I keep getting this error:
Error in file(file, ifelse(append, "a", "w")) : invalid 'open' argument
Can you help me please? My original filenames are in the format "SpeciesName_loc.csv", I would like to save the results as "SpeciesName_tavg.csv" or "SpeciesName_prec.csv".
Here is the code I tried most recently. Nothing has worked. Some versions have saved files, but as NA_prec.csv or NA_tavg.csv rather than with the proper species name.
PREC <- "D:/RED LAPTOP BACKUP/DRENNAN R/PALEOCLIMATE_PROJECT/PREC_2023"
TEMP <- "D:/RED LAPTOP BACKUP/DRENNAN R/PALEOCLIMATE_PROJECT/TEMP_2023"
FILES <- "D:/RED LAPTOP BACKUP/DRENNAN R/PALEOCLIMATE_PROJECT/SPECIES_LOC_2023"
psuf <- "_prec"
tsuf <- "_tavg"
species <-list.files(FILES, pattern = '.csv', full.names = TRUE)
for (i in species){
media <- read.csv(i)
clim1 <- media$lon
clim2 <- media$lat
clim1lon <- c(clim1)
clim2lat <- c(clim2)
df <- data.frame(x= clim1lon, y= clim2lat)
prec.data <- df
prec.data$Jan <- extract(prec1, df)
prec.data$Feb <- extract(prec2, df)
prec.data$Mar <- extract(prec3, df)
prec.data$Apr <- extract(prec4, df)
prec.data$May <- extract(prec5, df)
prec.data$Jun <- extract(prec6, df)
prec.data$Jul <- extract(prec7, df)
prec.data$Aug <- extract(prec8, df)
prec.data$Sep <- extract(prec9, df)
prec.data$Oct <- extract(prec10, df)
prec.data$Nov <- extract(prec11, df)
prec.data$Dec <- extract(prec12, df)
write.csv(prec.data, path = "PREC", gsub('loc.csv', 'prec.csv', i), row.names = FALSE)
temp.data <- df
temp.data$Jan <- extract(temp1, df)
temp.data$Feb <- extract(temp2, df)
temp.data$Mar <- extract(temp3, df)
temp.data$Apr <- extract(temp4, df)
temp.data$May <- extract(temp5, df)
temp.data$Jun <- extract(temp6, df)
temp.data$Jul <- extract(temp7, df)
temp.data$Aug <- extract(temp8, df)
temp.data$Sep <- extract(temp9, df)
temp.data$Oct <- extract(temp10, df)
temp.data$Nov <- extract(temp11, df)
temp.data$Dec <- extract(temp12, df)
write.csv(temp.data, path = "TEMP", gsub('loc.csv', 'tavg.csv', i), row.names = FALSE)
}

Your second and third arguments likely need to be combined into a path. The call fails because of a few things:
write.csv takes file=, not path=;
it cannot write to a directory, so file="TEMP" (inferring that "TEMP" is a local directory) cannot work; and
your third (unnamed) argument is being applied to the append= argument, which should be logical.
Instead, try
# `i` is a full path (list.files(..., full.names = TRUE)), so keep only the
# file name before swapping the suffix; the pattern is anchored and the dot
# escaped so only a trailing "_loc.csv" is replaced.
out_name <- sub("_loc\\.csv$", "_tavg.csv", basename(i))
# TEMP is the output directory variable defined at the top of the script.
write.csv(temp.data, file = file.path(TEMP, out_name), row.names = FALSE)
From ?write.csv, the args (and first few descriptions) are:
Usage:
write.table(x, file = "", append = FALSE, quote = TRUE, sep = " ",
eol = "\n", na = "NA", dec = ".", row.names = TRUE,
col.names = TRUE, qmethod = c("escape", "double"),
fileEncoding = "")
write.csv(...)
write.csv2(...)
Arguments:
x: the object to be written, preferably a matrix or data frame.
If not, it is attempted to coerce 'x' to a data frame.
file: either a character string naming a file or a connection open
for writing. '""' indicates output to the console.
append: logical. Only relevant if 'file' is a character string. If
'TRUE', the output is appended to the file. If 'FALSE', any
existing file of the name is destroyed.

Why can I not print out dataframe from Excel in R

Trying to print out dataframe that is created after importing Excel file into R using following code:
library("readxl")
data <- read_excel("grad programs.xlsx", sheet="Sheet2")
print(data)
But instead of getting the Excel file, I get this really long random message:
print(data)
function (..., list = character(), package = NULL, lib.loc = NULL,
verbose = getOption("verbose"), envir = .GlobalEnv, overwrite = TRUE)
{
fileExt <- function(x) {
db <- grepl("\\.[^.]+\\.(gz|bz2|xz)$", x)
ans <- sub(".*\\.", "", x)
ans[db] <- sub(".*\\.([^.]+\\.)(gz|bz2|xz)$", "\\1\\2",
x[db])
ans
}
my_read_table <- function(...) {
lcc <- Sys.getlocale("LC_COLLATE")
on.exit(Sys.setlocale("LC_COLLATE", lcc))
Sys.setlocale("LC_COLLATE", "C")
read.table(...)
}
stopifnot(is.character(list))
names <- c(as.character(substitute(list(...))[-1L]), list)
if (!is.null(package)) {
if (!is.character(package))
stop("'package' must be a character vector or NULL")
}
paths <- find.package(package, lib.loc, verbose = verbose)
if (is.null(lib.loc))
paths <- c(path.package(package, TRUE), if (!length(package)) getwd(),
paths)
paths <- unique(normalizePath(paths[file.exists(paths)]))
paths <- paths[dir.exists(file.path(paths, "data"))]
dataExts <- tools:::.make_file_exts("data")
if (length(names) == 0L) {
db <- matrix(character(), nrow = 0L, ncol = 4L)
for (path in paths) {
entries <- NULL
packageName <- if (file_test("-f", file.path(path,
"DESCRIPTION")))
basename(path)
else "."
The message is longer than that, but that's the start. Any idea why I get this message rather than the actual data in the Excel sheet?

generate variable names in for loop

Hope you don't mind if this is too easy for you.
In R, I am using fromJSON() to read from 3 urls (tier 1 url) , in the JSON file there is "link" field which give me another url (tier 2 url) and I use that and read.table() to get my final data. My code now is like this:
# note, this code does not run
urlJohn <- www.foo1.com
urlJane <- www.foo2.com
urlJoe <- www.foo3.com
tempJohn <- fromJson(urlJohn)
tempJohn[["data"]][["rows"]]$link %<>%
{clean up this data}
dataJohn <- read.table(tempJohn[["data"]][["rows"]]$link,
header = TRUE,
sep = ",")
tempJane <- fromJson(urlJane)
tempJane[["data"]][["rows"]]$link %<>%
{clean up this data}
dataJane <- read.table(tempJane[["data"]][["rows"]]$link,
header = TRUE,
sep = ",")
tempJoe <- fromJson(urlJoe)
tempJoe[["data"]][["rows"]]$link %<>%
{clean up this data}
dataJoe <- read.table(tempJoe[["data"]][["rows"]]$link,
header = TRUE,
sep = ",")
As you can see, I am just copying-n-pasting code blocks. What I wish is this:
# note, this code also does not run
urlJohn <- www.foo1.com
urlJane <- www.foo2.com
urlJoe <- www.foo3.com
source <- c("John", "Jane", "joe")
for (i in source){
temp <- paste(temp, i, sep = "")
url <- paste(url, i, sep = "")
data <- paste(data, i, sep = "")
temp <- fromJson(url)
temp[["data"]][["rows"]]$link %<>%
{clean up this data}
data <- read.table(temp[["data"]][["rows"]]$link,
header = TRUE,
sep = ",")
}
What do I need to do to make the for loop work? If my question is not clear, please ask me to clarify it.

I usually find lapply more convenient than a for loop, although you can easily convert this to a for loop if needed.
URLs <- c('www.foo1.com', 'www.foo2.com', 'www.foo3.com')
lapply(URLs, function(x) {
temp <- jsonlite::fromJSON(x)
temp[["data"]][["rows"]]$link %<>% {clean up this data}
read.table(temp[["data"]][["rows"]]$link,header = TRUE,sep = ",")
}) -> list_data
list_data

Thanks to @Ronak Shah. The R community strongly favors "non-for-loop" solutions.
The way to get my desired result is lapply.
Below is non-running pseudocode that sketches the idea:
URLs <- c('www.foo1.com', 'www.foo2.com', 'www.foo3.com')
lapply(URLs, function(x) {
temp <- jsonlite::fromJSON(x)
x <- temp[["data"]][["rows"]]$link %<>% {clean up this data}
y <- read.table(temp[["data"]][["rows"]]$link,header = TRUE,sep = ",")
return(list(x, y))
})
And this is a running example.
# Named list of three vectors to iterate over.
x <- list(
  alpha = 1:10,
  beta = exp(-3:3),
  logic = c(TRUE, FALSE, FALSE, TRUE)
)
# For each element, pair the original vector with half of its sum.
lapply(x, function(v) {
  list(v, sum(v) / 2)
})

How to batch process geoTIFFs in R with lapply

I have some large geoTIFFs that I want to convert to ASCII files. After some searching, I wrote this code:
library(raster)
f <- list.files("inputFolder", pattern = "*.tif", full.names = TRUE)
r <- lapply(f, raster)
a <- lapply(r, writeRaster, filename = "output", format = "ascii")
What confuses me is how to name each output file according to its original name.
I tried:
a <- lapply(r, writeRaster, filename = "outputFolder" + f, format = "ascii")
But I received error:
non-numeric argument to binary operator
Then I tried:
a <- lapply(r, writeRaster, filename = paste0(f, ".asc"), format = "ascii")
But I received:
Error in file(filename, "w") : invalid 'description' argument In
addition: Warning messages: 1: In if (filename == "") { : the
condition has length > 1 and only the first element will be used 2: In
if (!file.exists(dirname(filename))) { : the condition has length >
1 and only the first element will be used 3: In if
(toupper(x@file@name) == toupper(filename)) { : the condition has
length > 1 and only the first element will be used 4: In if
(trim(filename) == "") { : the condition has length > 1 and only the
first element will be used 5: In if (!file.exists(dirname(filename)))
{ : the condition has length > 1 and only the first element will be
used 6: In if (filename == "") { : the condition has length > 1 and
only the first element will be used 7: In if (!overwrite &
file.exists(filename)) { : the condition has length > 1 and only the
first element will be used

I think you were basically nearly there, with two corrections:
First, you're calling writeRaster for its side effects (i.e. its ability to write a file to your filesystem) so you don't need to assign the output of your lapply() loop to an object. So, removing a <- we have:
lapply(r, writeRaster, filename = paste0(f, ".asc"), format = "ascii")
Next, the filename argument won't loop through f in this way. You have two options, of which the simplest is probably to pass the @file@name slot of each element of r to the filename argument using an anonymous function:
# Write each raster using the file name stored in its own @file@name slot.
lapply(r, function(x) {
  writeRaster(x, filename = x@file@name, format = "ascii", overwrite = TRUE)
})
Your other option would be to loop through r and f in parallel like you can in python with for r, f in..., which can be done with purrr:
library("purrr")
walk2(r, f, function(x, y) {
writeRaster(x = x, filename = y, format = "ascii")
})
Here we're using walk2() rather than map2() because we need to call the function for side effects. This loops through r and f together so you can pass one to be the object to write, and one to be the filename.
Edit: here's the code I use to reproduce the problem
# Reproduction script: download sample GeoTIFFs into a temporary directory,
# load them as rasters, and write them back out with both approaches.
library("raster")
tmp_dir <- tempdir()
tmp <- tempfile(tmpdir = tmp_dir, fileext = ".zip")
download.file(
  "http://biogeo.ucdavis.edu/data/climate/cmip5/10m/cc26bi50.zip",
  destfile = tmp
)
unzip(tmp, exdir = tmp_dir)
# Escape the dot so only names ending in ".tif" match.
f <- list.files(tmp_dir, pattern = "\\.tif$", full.names = TRUE)
r <- lapply(f, raster)
# Solution one: take the file name from each raster's @file@name slot.
lapply(r, function(x) {
  writeRaster(x, filename = x@file@name, format = "ascii", overwrite = TRUE)
})
# Solution two: walk over the rasters and their file names in parallel.
library("purrr")
walk2(r, f, function(x, y) {
  # overwrite = TRUE because solution one was given the same file names above.
  writeRaster(x = x, filename = y, format = "ascii", overwrite = TRUE)
})

To test how to do this with small files:
# Build a few small GeoTIFFs from the logo file shipped with the raster
# package, one file per layer, to try the batch conversion on.
library(raster)
s <- stack(system.file("external/rlogo.grd", package="raster"))
# Spell out `filename` and TRUE: `file=` relies on partial argument matching
# and `T` is an ordinary variable that can be reassigned.
writeRaster(s, filename='testtif', format='GTiff', bylayer=TRUE, overwrite=TRUE)
f <- list.files(pattern="testtif_..tif")
Now you can use f with Phil's nice examples. You can also combine all in one step lapply:
f <- list.files("inputFolder", pattern = "*.tif", full.names = TRUE)
r <- lapply(f, function(i) { writeRaster(raster(i), filename=extension(i, '.asc'), overwrite=TRUE)} )
But if you have trouble with lapply, write a loop (it is fine!):
# Convert each file in `f` to an .asc raster next to the original.
# seq_along() yields an empty sequence when `f` is empty, whereas
# 1:length(f) would iterate over c(1, 0).
for (i in seq_along(f)) {
  r <- raster(f[i])
  ff <- extension(f[i], '.asc')
  writeRaster(r, ff)
}
Or like this
for (file in f) {
r <- raster(file)
ff <- extension(file, '.asc')
writeRaster(r, ff)
}

How do I fix the “No encoding Supplied” error?

I am facing difficulties after running the code and trying to export the dataset to a spreadsheet or .txt file.
I am a newbie to R, so maybe this question is trivial.
After running the following code:
# Fetch one or more series from the EIA API and return the parsed response.
#
# Arguments:
#   api_key    API key, sent as the `api_key` query parameter.
#   series_id  Character vector of series ids; joined with ";" into a single
#              `series_id` query parameter.
#   start, end, num
#              Optional query parameters; omitted from the request when NULL.
#   tidy_data  When "tidy_long", the per-series data frames are row-bound into
#              one data frame with `series_time_frame` and `series_name`
#              columns added. Any other value leaves them as a list.
#   only_data  When TRUE, return only the `data` element.
#
# Returns the parsed response with the observations moved to a top-level
# `data` element, or just that element when `only_data` is TRUE.
eia_series <- function(api_key, series_id, start = NULL, end = NULL, num = NULL, tidy_data = "no", only_data = FALSE){
  # Notes carried over from the original; none of these checks are
  # implemented below:
  # max 100 series
  # test if num is not null and either start or end is not null. Not allowed
  # api_key test for character.
  # series_id test for character.
  # if start/end not null, then check if format matches series id date format
  # parse date and numerical data

  # parse url
  series_url <- httr::parse_url("http://api.eia.gov/series/")
  series_url$query$series_id <- paste(series_id, collapse = ";")
  series_url$query$api_key <- api_key
  series_url$query$start <- start
  series_url$query$end <- end
  series_url$query$num <- num

  # get data
  series_data <- httr::GET(url = series_url)
  # No `encoding` is passed here, so httr reports
  # "No encoding supplied: defaulting to UTF-8."; that is a message, not an
  # error. Passing encoding = "UTF-8" silences it.
  series_data <- httr::content(series_data, as = "text")
  series_data <- jsonlite::fromJSON(series_data)

  # Move data from data.frame with nested list and NULL existing
  series_data$data <- series_data$series$data
  series_data$series$data <- NULL

  # parse data
  series_data$data <- lapply(
    X = series_data$data,
    FUN = function(x) {
      data.frame(
        date = x[, 1],
        value = as.numeric(x[, 2]),
        stringsAsFactors = FALSE
      )
    }
  )

  # add names to the list with data: label each data frame with its series
  # id (the original assigned the data itself as the names)
  names(series_data$data) <- series_data$series$series_id

  # parse dates
  series_data$data <- eia_date_parse(series_list = series_data$data, format_character = series_data$series$f)

  # tidy up data
  if (tidy_data == "tidy_long") {
    series_data$data <- lapply(
      seq_along(series_data$data),
      function(x) {
        cbind(
          series_data$data[[x]],
          series_time_frame = series_data$series$f[x],
          series_name = series_data$series$series_id[x],
          stringsAsFactors = FALSE
        )
      }
    )
    series_data$data <- do.call(rbind, series_data$data)
  }

  # only data
  if (only_data) {
    series_data <- series_data$data
  }

  return(series_data)
}
After running the function
eia_series(api_key = "XXX",series_id = c("PET.MCRFPOK1.M", "PET.MCRFPOK2.M"))
I tried to "transfer" the data in order to export it but got the following error:
No encoding supplied: defaulting to UTF-8.
I don't understand why. Could you help me out?

That doesn't look like an error, rather a statement. Probably coming from httr::content(series_data, as = "text"). Look in https://cran.r-project.org/web/packages/httr/vignettes/quickstart.html in The body section. It shouldn't be a problem, as long as your data returns what you expect. Otherwise you can try different encoding or there is a bug elsewhere.

Try:
series_data <- httr::content(series_data, as = "text", encoding = "UTF-8")

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Read csv file from S3 into spark in R - r

Related

How do I use write.csv in a for loop to use the original file name and a suffix to save my results to file?

Why can I not print out dataframe from Excel in R

generate variable names in for loop

How to batch process geoTIFFs in R with lapply

How do I fix the “No encoding Supplied” error?

Categories

Resources