Simplify R code to import big data as character

Simplify R code to import big data as character - r

I currently use the code below very often to import a big dataset into R, forcing every column to be read as character in order to avoid the truncation of rows. The code seems to work well, but I was wondering whether any of you know how it could be simplified or improved so that it doesn't get so repetitive each time I need to do it.
# Load readr for read_delim() and stringr for str_c().
library(readr)
library(stringr)
# Ask the user to pick exactly one file (multi = FALSE) via a file-selection dialog.
dataset.path <- choose.files(caption = "Select dataset", multi = FALSE)
# Read only the header row (n_max = 0) to learn how many columns the file has.
data.columns <- read_delim(dataset.path, delim = '\t', col_names = TRUE, n_max = 0)
# Build a compact col_types string with one "c" (character) per column, e.g. "ccc".
data.coltypes <- c(rep("c", ncol(data.columns)))
data.coltypes <- str_c(data.coltypes, collapse = "")
# Read the whole tab-delimited file, parsing every column as character.
dataset <- read_delim(dataset.path, delim = '\t', col_names = TRUE, col_types = data.coltypes)

As @Roland has suggested, you should write a function. Here is one possibility:
# Read a tab-delimited file with every column parsed as character.
#
# Args:
#   path: Optional path to the file. When NULL (the default) a file-selection
#     dialog is shown, exactly as before, so existing `foo()` calls still work.
#     Note that choose.files() is only available on Windows builds of R.
#
# Returns:
#   The tibble produced by readr::read_delim(), all columns of type character.
#   Assign it, e.g. `dataset <- foo()`.
foo <- function(path = NULL) {
  if (is.null(path)) {
    path <- choose.files(caption = "Select dataset", multi = FALSE)
  }
  # choose.files() returns a zero-length vector when the dialog is cancelled.
  if (length(path) != 1L || !nzchar(path)) {
    stop("No dataset selected.", call. = FALSE)
  }
  # `.default` applies the character parser to every column, so there is no
  # need to read the header first just to count the columns.
  readr::read_delim(
    path,
    delim = "\t",
    col_names = TRUE,
    col_types = readr::cols(.default = readr::col_character())
  )
}
You can then call the function and keep its result, e.g. dataset <- foo(), whenever you need to read a dataset in using this method.
your two liner:
data.coltypes <- c(rep("c", ncol(data.columns)))
data.coltypes <- str_c(data.coltypes, collapse = "")
can be collapsed into a single line that uses only base R's paste() instead of str_c() from the stringr package: paste(rep("c", ncol(data.columns)), collapse = "").

Related

R - list of lists from multiple csv with multiple headers with an inconsistent number and order of columns

I have a number of comma-separated .txt files that have an inconsistent number and order of columns added to their left and right sides. Each change, though, is preceded by a row of headers. read.table and fread struggle with these additional columns, but after trawling countless Stack Overflow questions on the topic [there are many! e.g. defining col.names, setting fill = TRUE, or using ncols <- max(count.fields(file, sep = ",")) did not work], I found a way to deal with these inconsistencies using the headers here: "Is there a better way in R to split a file with multiple sections". It produces a list of all the headed sections in a single file, which I can then rbindlist() on the common columns I want.
My question is how to use a loop or a function to apply this code to a folder containing a number of these .txt files and load them into, perhaps, a list of lists. I am a beginner, so I am still trying to grasp nested looping and have been pulling my hair out for weeks! Can anyone help? Thanks.
Here is example code to work with:
My data are big files of lat/long info but I have created an example dataset below.
library(data.table)
# Example data: each vector holds the lines of one file. A file consists of
# several sections, each introduced by a header line; the header grows from
# "C,D,E,F" to "A,B,C,D,E,F" to "A,B,C,D,E,F,G,H" further down the file.
tmp1 <- c("C,D,E,F", "1,1,1,1", "2,2,2,2", "C,D,E,F", "3,3,3,3", "4,4,4,4",
"5,5,5,5", "C,D,E,F", "6,6,6,6", "7,7,7,7", "8,8,8,8", "9,9,9,9",
"A,B,C,D,E,F", "10,10,10,10,10,10", "11,11,11,11,11,11", "A,B,C,D,E,F",
"12,12,12,12,12,12", "13,13,13,13,13,13", "14,14,14,14,14,14",
"15,15,15,15,15,15", "A,B,C,D,E,F,G,H", "16,16,16,16,16,16,16,16",
"17,17,17,17,17,17,17,17", "18,18,18,18,18,18,18,18", "A,B,C,D,E,F,G,H",
"19,19,19,19,19,19,19,19", "20,20,20,20,20,20,20,20")
# Same layout as tmp1, with values 21-40.
tmp2 <- c("C,D,E,F", "21,21,21,21", "22,22,22,22", "C,D,E,F", "23,23,23,23",
"24,24,24,24", "25,25,25,25", "C,D,E,F", "26,26,26,26", "27,27,27,27",
"28,28,28,28", "29,29,29,29", "A,B,C,D,E,F", "30,30,30,30,30,30",
"31,31,31,31,31,31", "A,B,C,D,E,F", "32,32,32,32,32,32", "33,33,33,33,33,33",
"34,34,34,34,34,34", "35,35,35,35,35,35", "A,B,C,D,E,F,G,H",
"36,36,36,36,36,36,36,36", "37,37,37,37,37,37,37,37", "38,38,38,38,38,38,38,38",
"A,B,C,D,E,F,G,H", "39,39,39,39,39,39,39,39", "40,40,40,40,40,40,40,40")
# NOTE(review): tmp3 repeats the values of tmp2 (21-40) verbatim.
tmp3 <- c("C,D,E,F", "21,21,21,21", "22,22,22,22", "C,D,E,F", "23,23,23,23",
"24,24,24,24", "25,25,25,25", "C,D,E,F", "26,26,26,26", "27,27,27,27",
"28,28,28,28", "29,29,29,29", "A,B,C,D,E,F", "30,30,30,30,30,30",
"31,31,31,31,31,31", "A,B,C,D,E,F", "32,32,32,32,32,32", "33,33,33,33,33,33",
"34,34,34,34,34,34", "35,35,35,35,35,35", "A,B,C,D,E,F,G,H",
"36,36,36,36,36,36,36,36", "37,37,37,37,37,37,37,37", "38,38,38,38,38,38,38,38",
"A,B,C,D,E,F,G,H", "39,39,39,39,39,39,39,39", "40,40,40,40,40,40,40,40")
# Same layout again, with values 61-80.
tmp4 <- c("C,D,E,F", "61,61,61,61", "62,62,62,62", "C,D,E,F", "63,63,63,63",
"64,64,64,64", "65,65,65,65", "C,D,E,F", "66,66,66,66", "67,67,67,67",
"68,68,68,68", "69,69,69,69", "A,B,C,D,E,F", "70,70,70,70,70,70",
"71,71,71,71,71,71", "A,B,C,D,E,F", "72,72,72,72,72,72", "73,73,73,73,73,73",
"74,74,74,74,74,74", "75,75,75,75,75,75", "A,B,C,D,E,F,G,H",
"76,76,76,76,76,76,76,76", "77,77,77,77,77,77,77,77", "78,78,78,78,78,78,78,78",
"A,B,C,D,E,F,G,H", "79,79,79,79,79,79,79,79", "80,80,80,80,80,80,80,80")
# Create "tmpfolder" under the current working directory and dump each example
# vector into its own text file, one vector element per line (no quoting, no
# row or column names).
wd <- getwd()
dir.create("tmpfolder")
write.table(tmp1, paste0(wd, "/tmpfolder/tmp1.txt"),
            quote = FALSE, sep = "", row.names = FALSE, col.names = FALSE)
write.table(tmp2, paste0(wd, "/tmpfolder/tmp2.txt"),
            quote = FALSE, sep = "", row.names = FALSE, col.names = FALSE)
write.table(tmp3, paste0(wd, "/tmpfolder/tmp3.txt"),
            quote = FALSE, sep = "", row.names = FALSE, col.names = FALSE)
# an example of my current lack of ability to use loops!
write.table(tmp4, paste0(wd, "/tmpfolder/tmp4.txt"),
            quote = FALSE, sep = "", row.names = FALSE, col.names = FALSE)
# Split ONE multi-section file into a list of data frames, one per headed section.
# Path of the first example file.
file = file.path(paste(wd,"/tmpfolder/tmp1.txt",sep=""))
# All lines of the file as a character vector.
tmp = readLines(file)
# Zero-based offsets of every line containing "C". In the example data only the
# header lines contain a "C", so these are the header positions.
sof <- (grep("C", tmp)) - 1
# One-based line number of each header.
real_start <- sof + 1
# sof[-1] - 1 for every section that is followed by another header, and the
# total line count for the final section.
real_end <- c(sof[-1] - 1, length(tmp))
# Passed to fread() as nrows below. For the example data this equals the number
# of data rows in each section; for the final section it is one more, which
# just runs into the end of the file.
to_read <- real_end - real_start + 1
# Pre-allocate one list slot per section.
my_dfs <- vector("list", length = length(real_start))
for(i in 1:length(my_dfs)){
# Read section i: skip everything before its header, read at most to_read[i]
# rows, and return a plain data.frame (data.table = FALSE). Warnings from
# fread() are suppressed.
my_dfs[[i]] <- suppressWarnings(
data.table::fread(file,
sep = ",",
skip = sof[i],
nrows = to_read[i],
fill = TRUE,
check.names = FALSE,
data.table = FALSE,
)
)
}

you can just build a loop around your code like this:
# Write each example vector tmp1..tmp4 to tmpfolder/tmp<i>.txt, one element per
# line. Only four vectors exist, so the loop must stop at 4; get() looks each
# vector up by name, which avoids eval(parse()).
for (i in 1:4) {
  write.table(
    get(paste0("tmp", i)),
    file.path(wd, "tmpfolder", paste0("tmp", i, ".txt")),
    sep = "", row.names = FALSE, col.names = FALSE, quote = FALSE
  )
}
And the same for reading in the data
# Split one file into its headed sections and return them as a list of data
# frames. A line is treated as a section header when it matches
# `header_pattern` (by default: it starts with a letter, as in the example
# data, where all data lines start with a digit).
read_sections <- function(path, header_pattern = "^[A-Za-z]") {
  lines <- readLines(path)
  starts <- grep(header_pattern, lines)
  if (length(starts) == 0L) {
    return(list())
  }
  # Each section runs up to the line before the next header (or end of file).
  ends <- c(starts[-1L] - 1L, length(lines))
  Map(
    function(from, to) {
      utils::read.csv(
        text = lines[from:to],
        check.names = FALSE,
        stringsAsFactors = FALSE
      )
    },
    starts,
    ends
  )
}

# Read every .txt file in the folder. The result is a list of lists: one
# element per file (named after the file), each holding one data frame per
# section. lapply() keeps the list flat instead of nesting it one level deeper
# on every iteration.
files <- list.files(
  file.path(wd, "tmpfolder"),
  pattern = "\\.txt$",
  full.names = TRUE
)
d.out <- lapply(files, read_sections)
names(d.out) <- basename(files)

How to "fread" a list of files with a different number of columns?

Usually, I am led to read lists of files which all have the same format, the same number of columns.
My function looks like :
# Read a set of ";"-separated, header-less files and return them as a list
# with one element per entry of `files`.
fun.read <- function(files) {
# Reads a single file, keeping only its first seven columns.
read <- function(filename){
# NOTE(review): `c(...)` appears to be a placeholder for the seven real column
# names; as written it is not valid here because neither function takes `...`.
DT <- data.table::fread(filename, header = FALSE, sep = ";", select = 1:7, col.names = c(...))
}
# The assignment is the last expression, so the list is returned invisibly.
lst <- lapply(files, read)
}
It works fine.
But now I have to do the same when my files don't all have the same number of columns.
The way I do this is, for example, something like :
# Variant of fun.read() for files that have either seven or eight ";"-separated
# columns. Returns (invisibly) a list with one element per entry of `files`.
fun.read <- function(files) {
read <- function(filename){
# count.fields() scans the whole file; it is evaluated once per branch tested.
if (max(count.fields(filename, sep = ";")) == 7) {
# NOTE(review): `c(...)` appears to be a placeholder for the seven column names.
DT <- data.table::fread(filename, header = FALSE, sep = ";", select = 1:7, col.names = c(...))
} else if (max(count.fields(filename, sep = ";")) == 8){
# NOTE(review): `c(...)` appears to be a placeholder for the eight column names.
DT <- data.table::fread(filename, header = FALSE, sep = ";", select = 1:8, col.names = c(...))
}
# With any other column count neither branch runs and the result is NULL.
}
lst <- lapply(files, read)
}
It seems to work fine too, but I'm wondering if there is not a more efficient / elegant way to do this ?
I looked towards the fill = TRUE option, without success...
Many thanks !!

In case it is helpful to someone, here is how I simplified my script:
Including the if in the col.names()
# Largest number of ";"-separated fields found on any line of the file.
x <- max(count.fields(filename, sep = ";"))
# An `if` without `else` yields NULL when its condition is FALSE, and c() drops
# NULL, so the fourth name is only supplied when the file has more than three
# columns.
# NOTE(review): the "..." strings look like placeholders for real column names.
DT <- data.table::fread(filename, header = FALSE, sep = ";", col.names = c("...", "...", "...", if(x>3)"..."))
Thanks.

R asks for a list which seems to be a list according to is.list (=TRUE)

I am using the RAM package.
The function I use is very simple for diversity index, adding up a column in my metadata ;
outname <-OTU.diversity(data=OTUtables, meta=metatables)
(Arguments: data a list of OTU tables.
meta the metadata to append the outputs)
I am looping it but I get this error:
please provide otu tables as list; see ?RAM.input.formatting
So I go to that help menu and read this:
one data set:
data=list(data=otu)
multiple data sets:
data=list(data1=otu1, data2=otu2, data3=otu3)
here is my code:
# For every row of metadataMasterTax: read a metadata table and an OTU table,
# call OTU.diversity() on them, and write the metadata back out as a .tsv file.
# This initial value is overwritten by the for loop below.
i <- 1
for(i in 1:nrow(metadataMasterTax)){
# Metadata table for row i (tab-separated, with header).
temp <- read.table(paste(metadataMasterTax$DataAnFilePath[i], metadataMasterTax$meta[i], sep = ""),
sep = "\t", header = TRUE, dec = ".", comment.char = "", quote = "", stringsAsFactors = TRUE,
as.is = TRUE)
temp2 <- temp
temp2$row.names <- NULL #to unactivate numbers generated in the margin
# OTU table for row i; check.names = FALSE keeps the column names as they are
# in the file.
trans <- read.table(paste(metadataMasterTax$taxPath[i], metadataMasterTax$taxName[i], sep = ""),
sep = "\t", header = TRUE, dec = ".", comment.char = "", quote = "", stringsAsFactors = TRUE,
as.is = TRUE, check.names = FALSE)
trans2 <- trans
trans2$row.names <- NULL #to unactivate numbers generated in the margin
# Builds a named list, but this `data` object is never passed on: the call
# below uses trans2[i] directly.
data=list(data=trans2[i])
# trans2[i] and temp2[i] each select only the i-th COLUMN of their data frame.
# NOTE(review): the help text quoted in the question expects
# data = list(data = otu) with a whole OTU table — confirm the intended call.
temp2[i] <- OTU.diversity(data=trans2[i], meta=temp2[i])
# Error in OTU.diversity(trans2, temp2) :
# please provide otu tables as list; see ?RAM.input.formatting
# is.list(trans2)
# [1] TRUE
# is.list(data)
# [1] TRUE
temp$taxonomy <- temp2$taxonomy
# Write the metadata for row i to <pathDataAn>diversityDir/<ShortName>.meta.div.tsv.
write.table(temp, file=paste(pathDataAn, "diversityDir/", metadataMasterTax$ShortName[i], ".meta.div.tsv", sep = ""),
append = FALSE,
sep = "\t",
row.names = FALSE)
}
Can anyone help me please....
thanks a lot

Because the main problem appears to be getting the OTU.diversity function to work, I focus on this issue. The code snippet below runs OTU.diversity without any problems, using the Google sheets data provided by OP.
library(gsheet)
library(RAM)
# Template loop: each iteration loads the metadata and the OTU table, runs
# OTU.diversity() and writes the result to its own file.
for (i in 1:2) {
# Meta data
temp <- as.data.frame(gsheet2tbl("https://drive.google.com/open?id=1hF47MbYZ1MG6RzGW-fF6tbMT3z4AxbGN5sAOxL4E8xM"))
temp$row.names <- NULL
# OTU
trans <- as.data.frame(gsheet2tbl("https://drive.google.com/open?id=1gOaEjDcs58T8v1GA-OKhnUsyRDU8Jxt2lQZuPWo6XWU"))
trans$row.names <- NULL
# Use every OTU column name except the last as the metadata row names.
# NOTE(review): presumably the last OTU column is the taxonomy column — confirm.
rownames(temp) <- colnames(trans)[-ncol(trans)]
# The whole OTU table is wrapped in a named list, matching the
# data = list(data = otu) form from the package help.
temp2 <- OTU.diversity(data = list(data = trans), meta = temp)
write.table(temp2,
file = paste0("file", i, ".meta.div.tsv"), # replace
append = FALSE,
sep = "\t",
row.names = FALSE)
}
Replace for (i in 1:2) with for(i in 1:nrow(metadataMasterTax)), as.data.frame(gsheet2tbl(...)) with read.table(...), and the file argument in write.table with the appropriate string.

twitteR how to search for two hashtags

Is it possible to look for two different hashtags in one searchTwitter command?
Example
# Convert a number of hours into seconds.
hrs <- function(u) {
  u * 3600
}

# Current time, re-parsed from its text form in the CET time zone.
my_h <- strptime(
  as.POSIXlt(Sys.time()),
  format = "%Y-%m-%d %H:%M:%S",
  tz = "CET"
)
# Step back 24 hours.
my_h <- my_h - hrs(24)
# Re-parse using only the %Y-%m-%d part and store it as column `day`.
my_h <- data.frame(day = strptime(my_h, "%Y-%m-%d", tz = ""))
I want to look for the hashtags #dn and #park.
I can do it separately as below
# Fetch up to 5000 tweets per hashtag posted since my_h$day and append them,
# without a header row, to one ";"-separated file per hashtag.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
tweets <- twListToDF(searchTwitter("#dn", n = 5000, since = as.character(my_h$day)))
write.table(tweets, "all_dn_tweets.csv", row.names = FALSE, append = TRUE, sep = ";", col.names = FALSE)
tweets <- twListToDF(searchTwitter("#park", n = 5000, since = as.character(my_h$day)))
write.table(tweets, "all_park_tweets.csv", row.names = FALSE, append = TRUE, sep = ";", col.names = FALSE)
The question is if these two can be squeezed into one?

Try something like this
# Search for all hashtags in a single query, then split the results into one
# CSV file per hashtag.
hashtags <- c("#metallica", "#slayer")
# Joining the hashtags with " OR " builds one query string for all of them.
needle <- paste(hashtags, collapse = " OR ")
tweets <- searchTwitter(needle, n = 10)
df <- twListToDF(tweets)
# Lower-case the tweet text once, outside the loop.
tweet_text <- tolower(df$text)
for (hashtag in hashtags) {
  # Lower-case the hashtag as well, so that a mixed-case entry in `hashtags`
  # still matches the lower-cased text.
  hits <- grepl(tolower(hashtag), tweet_text, fixed = TRUE)
  write.csv(df[hits, ], paste0(hashtag, ".csv"))
}
Using tolower() on tweets may need some error handling - you'll find plenty of information on that on the web.

I found the answer posted here more elegant. For the present case it basically is
# Put both hashtags into a single query string.
# NOTE(review): presumably the "+" makes the search require both hashtags —
# confirm against the Twitter search operators.
hashtags <- '#metallica + #slayer'
# One search call for the combined query, with lang = 'en' and
# retryOnRateLimit = 100 passed through to searchTwitter().
tweets <- searchTwitter(hashtags, n = 10, lang = 'en', retryOnRateLimit = 100)
# Convert the returned list of tweets into a data frame.
tweetsDF <- twListToDF(tweets)

Can I nest apply functions in R?

I have a series of CSV files that I want to prepare to append together. My appended file will be large, so I'd like to convert some string variables to numeric and date formats in the individual files rather than the larger appended file.
With other software, I would have one for loop that opens the file and nested for loops that would iterate over certain groups of variables. For this project, I am attempting to use R and apply functions.
I have mapply and lapply functions that work independently. I'm now trying to figure out how to combine them. Can I nest them? (See below for the independent parts and the nesting.)
(This code references code in the answer to How do I update data frame variables with sapply results?)
(Is it customary to provide an example CSV to give a reproducible example? Does R have built-in example CSVs?)
These work separately:
# Read "<fileroot>.csv", add a `division` column holding `divisionname`, and
# save the result as "<fileroot>_adj3.csv" (without row names).
insert.division <- function(fileroot, divisionname) {
  suffix <- ".csv"
  records <- read.csv(
    paste0(fileroot, suffix),
    header = TRUE,
    stringsAsFactors = FALSE
  )
  records$division <- divisionname
  write.csv(
    records,
    file = paste0(fileroot, "_adj3", suffix),
    row.names = FALSE
  )
}
# File roots (names without the ".csv" extension) and the division for each.
files <- paste0("file", 1:5)
divisions <- 1:5
#Open the files, insert division name, save new versions
mapply(insert.division, fileroot = files, divisionname = divisions)
#Change currency variables from string to numeric
currency.vars <- c("Price", "RetailPrice")
df[currency.vars] <- lapply(df[currency.vars], function(amount) {
  # Drop "$", "," and a trailing ")"; a leading "(" then becomes a minus sign.
  unformatted <- gsub("[$,]|\\)$", "", amount)
  as.numeric(sub("^\\(", "-", unformatted))
})
Combined version:
# Read "<fileroot>.csv", add a `division` column, convert the currency columns
# from text to numeric, and save the result as "<fileroot>_adj.csv".
#
# Args:
#   fileroot: Path of the input file without the ".csv" extension.
#   divisionname: Value stored in the new `division` column.
#   currency.vars: Character vector naming the columns that hold currency
#     strings such as "$1,234.50" or "($5.00)" (parentheses mean negative).
#
# Returns:
#   NULL, invisibly (the value of write.csv()); called for its side effect.
file.prep <- function(fileroot, divisionname, currency.vars) {
  ext <- ".csv"
  file <- paste0(fileroot, ext)
  data <- read.csv(file, header = TRUE, stringsAsFactors = FALSE)
  data$division <- divisionname
  # Convert the columns of the data frame that was just read. The previous
  # version assigned into `df`, so the file was written back unconverted.
  data[currency.vars] <- lapply(
    data[currency.vars],
    function(x) as.numeric(sub("^\\(", "-", gsub("[$,]|\\)$", "", x)))
  )
  write.csv(data, file = paste0(fileroot, "_adj", ext), row.names = FALSE)
}
#Open the files, insert division name, change the currency variables,
#save new versions
# `fileroot` and `divisionname` are iterated over in parallel, while the vector
# of currency column names must reach every call unchanged, so it is passed
# through MoreArgs instead of being vectorised over.
mapply(file.prep, fileroot = files, divisionname = divisions,
       MoreArgs = list(currency.vars = currency.vars))

I'm not really sure why you're writing it back to file after changing the data, but here's an example of how I might approach the problem.
## Build a 30-row example data frame and spread it over three csv files
set.seed(1)
DF <- data.frame(
  # currency strings wrapped in "($" and ")"
  w = paste0("($", sample(1500, 30) / 100, ")"),
  # 30 consecutive dates starting today
  x = Sys.Date() + 0:29,
  y = sample(letters, 30, TRUE),
  z = paste0("($", sample(1500, 30) / 100, ")")
)
fnames <- paste0("file", 1:3, ".csv")
# The three group labels are repeated down the rows, so every file receives
# every third row (10 rows each).
row_groups <- rep(c(1, 10, 20), length.out = nrow(DF))
mapply(
  write.csv,
  split(DF, row_groups),
  fnames,
  row.names = FALSE,
  SIMPLIFY = FALSE
)
Using your file.prep() function, you could adjust it a little and do
# Read "<fileroot>.csv", add a `division` column, convert the columns named in
# `vars` from currency text to numbers, and save as "<fileroot>_adj.csv".
#
# Args:
#   fileroot: Path of the input file without the ".csv" extension.
#   divname: Value stored in the new `division` column.
#   vars: Character vector naming the columns that hold currency strings such
#     as "$1,234.50" or "($12.34)" (parentheses mean a negative amount).
#
# Returns:
#   NULL, invisibly (the value of write.csv()); called for its side effect.
file.prep <- function(fileroot, divname, vars) {
  ext <- ".csv"
  file <- paste0(fileroot, ext)
  data <- read.csv(file, stringsAsFactors = FALSE)
  data$division <- divname
  data[vars] <- lapply(data[vars], function(x) {
    # Drop "$", "," and a trailing ")"; a leading "(" then becomes a minus
    # sign, so accounting-style negatives keep their sign.
    cleaned <- sub("^\\(", "-", gsub("[$,]|\\)$", "", x))
    # as.is must be given explicitly: current R warns when it is missing.
    type.convert(cleaned, as.is = TRUE)
  })
  write.csv(data, row.names = FALSE, file = paste0(fileroot, "_adj", ext))
}
# One division number per file; the file roots are "file1".."file3".
divname <- seq_len(3)
fnames <- paste0("file", divname)
# Walk over file roots and division numbers in parallel; the same two column
# names are handed to every call through MoreArgs.
mapply(
  file.prep,
  fnames,
  divname,
  MoreArgs = list(vars = c("w", "z")),
  SIMPLIFY = FALSE
)

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Simplify R code to import big data as character - r

Related

R - list of lists from multiple csv with multiple headers with an inconsistent number and order of columns

How to "fread" a list of files with a different number of columns?

R asks for a list which seems to be a list according to is.list (=TRUE)

twitteR how to search for two hashtags

Can I nest apply functions in R?

Categories

Resources