Skip empty files when importing text files - r

I have a folder with about 700 text files that I want to import and add a column to. I've figured out how to do this using the following code:
# List the files in the working directory whose names match the pattern.
files = list.files(pattern = "*c.txt")
# Accumulator for the combined rows; starts out as NULL.
DF <- NULL
for (f in files) {
# Read one headerless, comma-separated file. This is the call that stops with
# "no lines available in input" as soon as an empty file is reached.
data <- read.table(f, header = F, sep=",")
# Tag the rows with the part of the file name in front of "c.txt".
# (The trailing "<-- (...)" text is the asker's annotation, not R code.)
data$species <- strsplit(f, split = "c.txt") <-- (column name is filename)
# Append this file's rows to the running result.
DF <- rbind(DF, data)
}
# Write the combined table to an .xlsx file.
write.xlsx(DF,"B:/trends.xlsx")
Problem is, there are about 100 files that are empty. so the code stops at the first empty file and I get this error message:
Error in read.table(f, header = F, sep = ",") :
no lines available in input
Is there a way to skip over these empty files?

You can skip empty files by checking that file.size(some_file) > 0:
# List the CSV files together with their directory prefix, so that file.size()
# and read.csv() find them whatever the current working directory is.
# The pattern is a regular expression (not a glob): names ending in ".csv".
files <- list.files("~/tmp/tmpdir", pattern = "\\.csv$", full.names = TRUE)
##
# Keep only files that contain at least one byte; read.csv() fails with
# "no lines available in input" on a zero-byte file.
sizes <- file.size(files)
non_empty <- files[!is.na(sizes) & sizes > 0]
df_list <- lapply(non_empty, read.csv)
##
R> dim(do.call("rbind", df_list))
#[1] 50 2
This skips over the 10 files that are empty, and reads in the other 10 that are not.
Data:
# Build the example folder: ten populated CSV files and ten empty ones.
# The directory is created first so the snippet runs on a clean machine.
dir.create("~/tmp/tmpdir", recursive = TRUE, showWarnings = FALSE)
# The same small table is written to every populated file.
df <- data.frame(x = 1:5, y = 6:10)
for (i in 1:10) {
  write.csv(df, sprintf("~/tmp/tmpdir/file%i.csv", i), row.names = FALSE)
  ## empty file
  # file.create() makes a zero-byte file without depending on a shell command.
  file.create(sprintf("~/tmp/tmpdir/emptyfile%i.csv", i))
}

For a different approach that introduces explicit error handling, think about a tryCatch to handle anything else bad that might happen in your read.table.
# Collect one data frame per readable, non-empty file and bind them once at
# the end (avoids growing DF with rbind() inside the loop).
pieces <- list()
for (f in files) {
  # Skip empty (or unreadable-size) files up front; read.table() would fail
  # on them with "no lines available in input".
  if (!isTRUE(file.size(f) > 0)) {
    next
  }
  data <- tryCatch(
    read.table(f, header = FALSE, sep = ","),
    error = function(err) {
      # Report the failure and hand back NULL so the file is skipped below
      # instead of a message string being treated as data.
      message("Read.table didn't work!: ", f, ": ", conditionMessage(err))
      NULL
    }
  )
  if (is.null(data)) {
    next
  }
  # File name without the trailing "c.txt", stored as a plain character value
  # (strsplit() would return a list and create a list column).
  data$species <- sub("c\\.txt$", "", f)
  pieces[[length(pieces) + 1L]] <- data
}
# NULL when no file could be read, otherwise the combined data frame.
DF <- do.call(rbind, pieces)

Related

Why can I not print out dataframe from Excel in R

Trying to print out dataframe that is created after importing Excel file into R using following code:
# Load readxl, which provides read_excel().
library("readxl")
# Read the worksheet "Sheet2" of the workbook and store the result as `data`.
# NOTE(review): if this assignment fails, no variable `data` is created and the
# name presumably resolves to R's built-in data() function instead, which would
# explain a function body being printed; check for an earlier error message.
data <- read_excel("grad programs.xlsx", sheet="Sheet2")
print(data)
But instead of getting the Excel file, I get this really long random message:
print(data)
function (..., list = character(), package = NULL, lib.loc = NULL,
verbose = getOption("verbose"), envir = .GlobalEnv, overwrite = TRUE)
{
fileExt <- function(x) {
db <- grepl("\\.[^.]+\\.(gz|bz2|xz)$", x)
ans <- sub(".*\\.", "", x)
ans[db] <- sub(".*\\.([^.]+\\.)(gz|bz2|xz)$", "\\1\\2",
x[db])
ans
}
my_read_table <- function(...) {
lcc <- Sys.getlocale("LC_COLLATE")
on.exit(Sys.setlocale("LC_COLLATE", lcc))
Sys.setlocale("LC_COLLATE", "C")
read.table(...)
}
stopifnot(is.character(list))
names <- c(as.character(substitute(list(...))[-1L]), list)
if (!is.null(package)) {
if (!is.character(package))
stop("'package' must be a character vector or NULL")
}
paths <- find.package(package, lib.loc, verbose = verbose)
if (is.null(lib.loc))
paths <- c(path.package(package, TRUE), if (!length(package)) getwd(),
paths)
paths <- unique(normalizePath(paths[file.exists(paths)]))
paths <- paths[dir.exists(file.path(paths, "data"))]
dataExts <- tools:::.make_file_exts("data")
if (length(names) == 0L) {
db <- matrix(character(), nrow = 0L, ncol = 4L)
for (path in paths) {
entries <- NULL
packageName <- if (file_test("-f", file.path(path,
"DESCRIPTION")))
basename(path)
else "."
The message is longer than that, but that's the start. Any idea why I get this message rather than the actual data in the Excel sheet?

adding to lists together using cbind

This program works because I made the variables inside lapply global by using the <<- operator. However, it does not work with the real files in the real program. These are .tsv files with named columns. The answer I get when I run the real program is: Error: (converted from warning) Error in : (converted from warning) Error in : arguments imply differing number of rows: 3455, 4319. What might be causing this?
# Three groups of input file names (every entry is the placeholder test.txt).
lc <- list("test.txt", "test.txt", "test.txt", "test.txt")
lc1 <- list("test.txt", "test.txt", "test.txt")
lc2 <- list("test.txt", "test.txt")
# List of lists: each inner list holds the file names of one group.
lc <- list(lc, lc1, lc2)
# New names for the three groups in the list of lists.
new_dataFns <- list("name1", "name2", "name3")
file_paths <- NULL
new_path <- NULL
# For each group: prefix the file names with the working directory, read every
# file, then merge each table with itself.
# NOTE(review): `<<-` assigns to one shared `dataList` on every iteration, so
# only the last group's result is still available after this call.
lapply(
lc,
function(lc) {
filenames <- file.path(getwd(), lc)
dataList <<- lapply(filenames, function (lc) read.table(file=lc, header=TRUE))
dataList <<- lapply(dataList, function(dataList) {merge(as.data.frame(dataList),as.data.frame(dataList))})
}
)
# Build one output path per new name (working directory + new name).
# NOTE(review): `new_path` is overwritten each time as well, so only the last
# path survives.
lapply(new_dataFns, function(new_dataFns) {new_path <<- file.path(getwd(), new_dataFns)})
print(new_path)
print(dataList)
# NOTE(review): as.data.frame() on a list of tables presumably requires all of
# them to have the same number of rows; with real files of different lengths
# this looks like the source of the "differing number of rows" error - confirm.
finalFiles <- merge(as.data.frame(dataList), as.data.frame(new_path))
print(finalFiles)
I found a solution to the problem by writing a different type of code. Please see below. The input to the function is provided by the app input widgets
#' Append the contents of grouped CSV files to one output file per group
#'
#' For group number i in `nameList`, every readable CSV file of that group is
#' appended (without column names) to "<working directory>/<i>.csv".
#'
#' @param answer,fileChoice,combination,enteredValue Not used by this function;
#'   kept so existing callers keep working.
#' @param nameList List whose elements are collections of CSV file names,
#'   resolved relative to the working directory. NULL elements are skipped.
#' @return NULL, invisibly; the function is called for its file output.
glyCount1 <- function(answer = NULL, fileChoice = NULL, combination = NULL, enteredValue = NULL, nameList) {
  for (i in seq_along(nameList)) {
    group <- nameList[[i]]
    if (is.null(group)) {
      next
    }
    out_path <- file.path(getwd(), paste0(i, ".csv"))
    for (j in seq_along(group)) {
      # Index with j: the file belongs to position j of the current group.
      in_path <- file.path(getwd(), as.character(group[[j]]))
      content <- tryCatch(
        read.csv(file = in_path, header = TRUE, sep = ","),
        error = function(e) NULL
      )
      # A failed read is skipped rather than re-writing data left over from an
      # earlier file.
      if (is.null(content)) {
        next
      }
      write.table(content, file = out_path, append = TRUE, col.names = FALSE)
    }
  }
  invisible(NULL)
}

Inconclusive Error Message with download.file

I tried to download several thousand SEC files via the command:
download.file(link, folder, method = "internal", quiet = FALSE,
mode = "wb", cacheOK = TRUE,
extra = getOption("download.file.extra"))
After a while I get the following message that I cannot interpret:
https://dl.dropboxusercontent.com/u/4149177/Capture.PNG
It seems that the files are downloaded successfully, however I want to know what the message means.
Can you tell me what R tries to tell me?
Full code:
# NOTE(review): setInternet2() is presumably the Windows-only switch for R's
# internet backend; confirm it is still available in the R version in use.
setInternet2(use = FALSE)
# Downloaded filings are stored under the current working directory.
destinationfolder <- getwd()
# Inclusive range of years and quarters whose index files are processed.
startyear <- 2000
stopyear <- 2000
startquarter <- 1
stopquarter <- 2
# Form type used to select rows from the filings table.
filetype <- "10-Q"
#' Download and parse one quarterly SEC company index file
#'
#' @param year Year inserted into the index URL.
#' @param quarter Quarter inserted into the index URL (after "QTR").
#' @return A data frame with the columns company_name, form_type, cik,
#'   date_filed and file_name, or NULL when the download fails.
func.getsecindexfile <- function(year, quarter) {
  # Temporary location of the zipped index; removed again on exit.
  tf <- tempfile()
  on.exit(unlink(tf), add = TRUE)

  #### download the zipped index file from the SEC website
  index_url <- paste("http://www.sec.gov/Archives/edgar/full-index/", year,"/QTR", quarter, "/company.zip",sep="")
  # mode = "wb": the archive is binary and must not be written in text mode.
  result <- try(download.file(url = index_url, destfile = tf, mode = "wb"))

  # download.file() can also report failure through a non-zero status code
  # instead of raising an error, so both are checked.
  if (inherits(result, "try-error") || !identical(as.integer(result), 0L)) {
    return(NULL)
  }

  #### small function to remove leading and trailing spaces
  trim <- function(string) {
    string <- enc2native(string)
    gsub("^\\s*(.*?)\\s*$","\\1", string, perl=TRUE)
  }

  #### read the downloaded file; the connection is closed even if reading fails
  zz <- unz(description = tf, filename = "company.idx")
  raw.data <- tryCatch(readLines(con = zz), finally = close(zz))

  #### remove the first 10 rows (negative index is safe for short input,
  #### where 11:length(x) would count backwards)
  raw.data <- raw.data[-seq_len(10)]

  #### parse the fixed-width fields and return them as a data frame
  data.frame(
    company_name = trim(substr(raw.data, 1, 62)),
    form_type = trim(substr(raw.data, 63, 74)),
    cik = trim(substr(raw.data, 75, 86)),
    date_filed = as.Date(substr(raw.data, 87, 98)),
    file_name = trim(substr(raw.data, 99, 150))
  )
}
#### add index files to database
#' Append one parsed index file to the "filings" table
#'
#' @param data Data frame to append, or NULL when there is nothing to add.
#' @return NULL when `data` is NULL, otherwise the result of dbWriteTable().
func.addindexfiletodatabase <- function(data) {
  if (is.null(data)) {
    NULL
  } else {
    # `sqlite` is the database connection defined outside this function.
    dbWriteTable(sqlite, "filings", data, append = TRUE)
  }
}
# Start from an empty filings table, then load one index file per quarter.
dbGetQuery(sqlite, "DROP TABLE IF EXISTS filings")
for (year in startyear:stopyear) {
  for (quarter in startquarter:stopquarter) {
    func.addindexfiletodatabase(func.getsecindexfile(year, quarter))
  }
}

# Fetch the filings of the requested form type.
selection <- paste0("SELECT * FROM filings WHERE form_type IN ('", filetype, "')")
index <- dbGetQuery(sqlite, selection)

# Full download address of every filing: server prefix + path from the index.
pre <- c("ftp://ftp.sec.gov/")
temp <- paste0(pre, index$file_name)

# Local file name: the address without the server/data prefix, with the
# remaining slashes turned into dashes.
index$name_new <- gsub("/", "-", gsub("ftp://ftp.sec.gov/edgar/data/", "", temp))

# Vectors handed to the download step.
name <- paste(index$name_new)
link <- temp
#### define download function
#' Download one filing into the destination folder
#'
#' @param link URL of the filing.
#' @param name File name under which the filing is stored inside
#'   `destinationfolder` (defined outside this function).
#' @return The value of download.file().
func.download_files <- function(link, name) {
  # file.path() builds the target portably; a hard-coded "\\" separator only
  # yields a path inside the folder on Windows.
  folder <- file.path(destinationfolder, name)
  download.file(link, folder, method = "internal", quiet = FALSE, mode = "wb", cacheOK = TRUE, extra = getOption("download.file.extra"))
}
#### download the files
mapply(FUN = func.download_files, link = link, name = name)
The "error" was a notification that the files were successfully downloaded. Thank you for your help.

How to read csv inside a folder in R?

I am working in a directory, but the data I want to read is in a subdirectory. I get an error when I try to read the csv files, my code is the following:
# Work from ~/Documents; the CSV files are in its "data" subdirectory.
setwd("~/Documents/")
# list.files() returns bare file names here, without the "data/" prefix.
files <- list.files(path = "data/")
f <- list()
for (i in 1:length(files)) {
# The bare name is looked up in the working directory (~/Documents) rather than
# in data/, which is why the connection cannot be opened.
f[[i]] <- read.csv(files[i], header = T, sep = ";")
}
And the error I get is:
Error in file(file, "rt"): cannot open the connection
What am I doing wrong?
The following will work, assuming you have correctly specified the other read.csv parameters.
setwd("~/Documents/")
# Bare file names of everything inside data/.
files <- list.files(path = "data/")
# One slot per file, allocated up front.
f <- vector("list", length(files))
# seq_along() runs zero times for an empty folder, whereas 1:length(files)
# would iterate over 1 and 0 and try to read a non-existent file.
for (i in seq_along(files)) {
  # Prefix the subdirectory so the path resolves from ~/Documents.
  f[[i]] <- read.csv(paste0("data/", files[i]), header = TRUE, sep = ";")
}
Alternatively, you could drop the paste0 and simply set your working directory to ~/Documents/data/ in the first place.
setwd("~/Documents/data/")
files <- list.files() #No parameter necessary now since you're in the proper directory
# One slot per file, allocated up front.
f <- vector("list", length(files))
# seq_along() runs zero times for an empty folder, whereas 1:length(files)
# would iterate over 1 and 0 and try to read a non-existent file.
for (i in seq_along(files)) {
  f[[i]] <- read.csv(files[i], header = TRUE, sep = ";")
}
If you need to be in ~/Documents/ at the end of this loop, then finish it up by adding the following after the loop.
setwd("~/Documents/")

Cross Product of specific file and rest of the folder -- R

I have a problem making R read a set of files in a folder and return the cross product of them.
I have a folder which contains one test.csv file and n train.csv files.
I need a loop that reads through the folder and returns a file that contains the cross product of test and each of the train files, so the rows of the file should look like this:
test*train01
test*train02
test*train03
...
I wrote a script to make that for two defined line but don’t know how to adapt that for the whole folder and the pattern that I need.
# Load both files as matrices; header=FALSE means the first line is data.
data01 <- as.matrix(read.csv(file = "test.csv", sep = ",", header=FALSE))
data02 <- as.matrix(read.csv(file = "train.csv", sep = ",", header=FALSE))
# Per-row containers: the cross products, the test rows and the train rows.
test <- list()
test01<- list()
test02<- list()
i<- 1
# Process rows 1 to 25 of both matrices (the row count is hard-coded).
while (i <= 25){
test01[[i]] <- c(data01[i, ])
test02[[i]] <- c(data02[i, ])
# Cross product of the i-th test row with the i-th train row.
test[[i]]<- crossprod(test01[[i]],test02[[i]])
i <- i+1
}
# Write the collected cross products to one CSV file.
write.csv(test, file="testing.csv", row.names = FALSE)
Try:
#' Row-wise cross products between test.csv and one other CSV file
#'
#' @param data Path of the CSV file to pair with "test.csv".
#' @param n_rows Number of leading rows to process. Defaults to 25, the value
#'   that used to be hard-coded, so existing calls behave as before.
#' @return A list of length `n_rows`; element i is
#'   crossprod(row i of test.csv, row i of `data`).
test <- function(data, n_rows = 25) {
  data01 <- as.matrix(read.csv(file = "test.csv", sep = ",", header = FALSE))
  data02 <- as.matrix(read.csv(file = data, sep = ",", header = FALSE))
  # One cross product per row pair; seq_len() copes with n_rows = 0.
  lapply(seq_len(n_rows), function(i) {
    crossprod(c(data01[i, ]), c(data02[i, ]))
  })
}
result <- lapply(list.files(pattern='Train.*'),test)
Then just loop result to save in CSV file.
EDIT: How to save:
# Same file listing that produced `result`, used to name the output files.
files <- list.files(pattern='Train.*')
# seq_along() does nothing for an empty result; seq(length(result)) would
# yield c(1, 0) and fail on the first access.
for (i in seq_along(result)) {
  write.csv(result[[i]], paste0('result_', files[i]), row.names = FALSE)
}
EDIT: Saving in one file:
write.csv(do.call(rbind,result),'result.csv', row.names = FALSE) # Appending by row
or
write.csv(do.call(cbind,result),'result.csv', row.names = FALSE) # Appending by column

Resources