rbind txt files from online directory (R)

I am trying to concatenate text files from a URL, but I don't know how to handle the HTML and the different folders.
This is the code I tried, but it only lists the text files and returns a lot of HTML markup along with them. How do I fix this so that I can combine the text files into one csv file?
library(RCurl)
library(readr)

url <- "http://weather.ggy.uga.edu/data/daily/"
dir <- getURL(url, dirlistonly = TRUE)
filenames <- unlist(strsplit(dir, "\n"))  # split into filenames

# append the files one after another
for (i in 1:length(filenames)) {
  file <- paste0(url, filenames[i])  # build the full URL
  if (i == 1) {
    cp <- read_delim(file, col_names = FALSE, delim = ",")
  } else {
    temp <- read_delim(file, col_names = FALSE, delim = ",")
    cp <- rbind(cp, temp)  # append to the existing data
    rm(temp)               # remove the temporary copy
  }
}

Here is a code snippet that I got to work for me. I like to use rvest over RCurl, just because that's what I've learned. In this case, I was able to use the html_nodes function to isolate each link ending in .txt. The resulting table has the times saved as character strings, but you could fix that later. Let me know if you have any questions.
library(rvest)
library(readr)

url <- "http://weather.ggy.uga.edu/data/daily/"
doc <- xml2::read_html(url)
text <- rvest::html_text(rvest::html_nodes(doc, "tr td a:contains('.txt')"))

# define column types of fwf data ("c" = character, "n" = number)
ctypes <- paste0("c", paste0(rep("n", 11), collapse = ""))

data <- data.frame()
# loop over the first two files as a demo; use seq_along(text) to process all of them
for (i in 1:2) {
  file <- paste0(url, text[i])
  date <- as.Date(read_lines(file, n_max = 1), "%m/%d/%y")
  # Read file to determine column widths
  columns <- fwf_empty(file, skip = 3)
  # Manually expand `solar` column to be 3 spaces wider
  columns$begin[8] <- columns$begin[8] - 3
  data <- rbind(data, cbind(date, read_fwf(file, columns,
                                           skip = 3, col_types = ctypes)))
}
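To finish the original ask (writing the combined result to a single csv), one more line does it; the output file name here is just a placeholder:
write.csv(data, "combined_daily.csv", row.names = FALSE)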

Related

Combine csv files with common file identifier

I have a list of approximately 500 csv files, each with a filename that consists of a six-digit number followed by a year (e.g. 123456_2015.csv). I would like to append together all files that have the same six-digit number. I tried to implement the code suggested in this question:
Import and rbind multiple csv files with common name in R, but I want the appended data to be saved as new csv files in the same directory where the original files are currently saved. I have also tried to implement the code below, however the csv files it produces contain no data.
rm(list = ls())
filenames <- list.files(path = "C:/Users/smithma/Desktop/PM25_test")
NAPS_ID <- gsub('^([0-9]{5,6})\\_.+?$', '\\1', filenames)
Unique_NAPS_ID <- unique(NAPS_ID)
n <- length(Unique_NAPS_ID)
for (j in 1:n) {
  curr_NAPS_ID <- as.character(Unique_NAPS_ID[j])
  NAPS_ID_pattern <- paste(".+?\\_(", curr_NAPS_ID, "+?)\\_.+?$", sep = "")
  NAPS_filenames <- list.files(path = "C:/Users/smithma/Desktop/PM25_test", pattern = NAPS_ID_pattern)
  write.csv(do.call("rbind", lapply(NAPS_filenames, read.csv, header = TRUE)),
            file = paste("C:/Users/smithma/Desktop/PM25_test/MERGED", "MERGED_", Unique_NAPS_ID[j], ".csv", sep = ""),
            row.names = FALSE)
}
Any help would be greatly appreciated.
Because you're not doing any data manipulation, you don't need to treat the files like tabular data. You only need to copy the file contents.
filenames <- list.files("C:/Users/smithma/Desktop/PM25_test", full.names = TRUE)
NAPS_ID <- substr(basename(filenames), 1, 6)
Unique_NAPS_ID <- unique(NAPS_ID)

for (curr_NAPS_ID in Unique_NAPS_ID) {
  NAPS_filenames <- filenames[startsWith(basename(filenames), curr_NAPS_ID)]
  output_file <- paste0(
    "C:/Users/smithma/Desktop/PM25_test/MERGED_", curr_NAPS_ID, ".csv"
  )
  for (fname in NAPS_filenames) {
    line_text <- readLines(fname)
    # Write the header from the first file
    if (fname == NAPS_filenames[1]) {
      cat(line_text[1], '\n', sep = '', file = output_file)
    }
    # Append every line in the file except the header
    line_text <- line_text[-1]
    cat(line_text, file = output_file, sep = '\n', append = TRUE)
  }
}
My changes:
list.files(..., full.names = TRUE) is usually the best way to go.
Because the digits appear at the start of the filenames, I suggest substr. It's easier to get an idea of what's going on when skimming the code.
Instead of looping over the indices of a vector, loop over the values. It's more succinct and less likely to cause problems if the vector's empty.
startsWith and endsWith are relatively new functions, and they're great.
You only care about copying lines, so just use readLines to get them in and cat to get them out; a short demo follows below.
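As a quick illustration of those last points, here is a minimal, hypothetical sketch (the file names are made up) of the substr, startsWith, and readLines/cat idioms:
filenames <- c("123456_2014.csv", "123456_2015.csv", "654321_2015.csv")

substr(filenames, 1, 6)          # "123456" "123456" "654321"
startsWith(filenames, "123456")  # TRUE TRUE FALSE

# Copy a file's lines without parsing them as tabular data:
# lines <- readLines("123456_2014.csv")
# cat(lines, file = "copy.csv", sep = "\n")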
You might consider something like this:
path <- "C:/Users/smithma/Desktop/PM25_test/"
filenames <- list.files(path)

## take the first 6 characters of each file name
six.digit.filenames <- substr(filenames, 1, 6)
unique.numbers <- unique(six.digit.filenames)

for (j in unique.numbers) {
  sub <- filenames[which(substr(filenames, 1, 6) == j)]
  data.for.output <- c()
  for (file in sub) {
    ## now do your stuff with these files, including reading them in
    data <- read.csv(paste0(path, file))
    data.for.output <- rbind(data.for.output, data)
  }
  write.csv(data.for.output, paste0(path, j, '.csv'), row.names = FALSE)
}

Reading data from text file and combining it with date in r

I downloaded data from the internet. I wanted to extract the data and create a data frame. You can find the data via the following filtered data set link: http://www.esrl.noaa.gov/gmd/dv/data/index.php?category=Ozone&type=Balloon . At the bottom of the page, from the 9 filtered data sets, you can choose any station, say Suva, Fiji (SUV).
I have written the following code to create a data frame that has Launch date as part of the data frame for each file.
setwd("C:/Users/")
path = "~C:/Users/"
files <- lapply(list.files(pattern = '\\.l100'), readLines)
test.sample <- do.call(rbind, lapply(files, function(lines){
  data.frame(datetime = as.POSIXct(sub('^.*Launch Date : ', '', lines[grep('Launch Date :', lines)])),
             # and the data, read in as text
             read.table(text = lines[(grep('Sonde Total', lines) + 1):length(lines)]))
}))
The files are from an FTP server. The file pattern doesn't look familiar to me; even though I tried it with .txt, it didn't work. Can you please tweak the above code, or suggest other code, to get a data frame?
Thank you in advance.
I think the problem is that the search string "Launch Date :" does not match what is in the files (at least the one I checked).
This should work
lines <- "Launch Date : 11 June 1991"
lubridate::dmy(sub('^.*Launch Date.*: ', '', lines[grep('Launch Date', lines)]))
The code would probably be easier to debug if you broke the problem down into steps rather than writing it as one statement.
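For example, a sketch of the same extraction broken into named steps (the sample line is made up to mirror the files):
lines <- c("Launch Date : 11 June 1991")

# Step 1: find the line that holds the launch date
date_line <- lines[grep('Launch Date', lines)]
# Step 2: strip everything up to and including the colon
date_text <- sub('^.*Launch Date.*: ', '', date_line)
# Step 3: parse the day-month-year text into a Date
launch_date <- lubridate::dmy(date_text)
launch_date  # "1991-06-11"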
I took the following approach:
td <- tempdir()
setwd(td)

ftp <- 'ftp://ftp.cmdl.noaa.gov/ozwv/Ozonesonde/Suva,%20Fiji/100%20Meter%20Average%20Files/'
files <- RCurl::getURL(ftp, dirlistonly = TRUE)
files <- strsplit(files, "\n")
files <- unlist(files)
files <- files[files != ""]  # drop any empty entry left by the final newline

dat <- list()
for (i in 1:length(files)) {
  download.file(paste0(ftp, files[i]), 'data.txt')
  df <- read.delim('data.txt', sep = "", skip = 17)
  ld <- as.character(read.delim('data.txt')[9, ])  # the row holding the launch date
  ld <- strsplit(ld, ":")[[1]][2]
  df$launch.date <- stringr::str_trim(ld)
  dat[[i]] <- df; rm(df)
}
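Since the goal is a single data frame with the launch date attached, the list can then be stacked; this assumes every file parses to the same columns:
# Combine the per-file data frames into one
all_data <- do.call(rbind, dat)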

Extracting file numbers from file names in r and looping through files

I have a folder full of .txt files that I want to loop through and compress into one data frame, but each .txt file is data for one subject, and there are no columns in the text files that indicate subject number or time point in the study (e.g. 1-5). I need to add a line or two of code into my loop that looks for a string of four numbers (each file is labeled something like "4325.5_ERN_No_Startle") and creates one column with 4325 and another column with 5 that will appear for every data point for that subject until the loop gets to the next one. I have been looking for a while but am still coming up empty; any suggestions?
I also have not quite gotten the loop to work:
path = "/Users/me/Desktop/Event Codes/ERN task/ERN text files transferred"
out.file <- ""
file <- ""
file.names <- dir(path, pattern =".txt")
for(i in 1:length(file.names)){
file <- read.table(file.names[i],header=FALSE, fill = TRUE)
out.file <- rbind(out.file, file)
}
which runs okay until I get this error message part way through:
Error in read.table(file.names[i], header = FALSE, fill = TRUE) :
no lines available in input
Consider using regex to parse the file name for study period and subject, both of which are then bound in an lapply over list.files:
path = "path/to/text/files"
# ANY TXT FILE WITH PATTERN OF 4 DIGITS FOLLOWED BY A PERIOD AND ONE DIGIT
file.names <- list.files(path, pattern="*[0-9]{4}\\.[0-9]{1}.*txt", full.names=TRUE)
# IMPORT ALL FILES INTO A LIST OF DATAFRAMES AND BINDS THE REGEX EXTRACTS
dfList <- lapply(file.names, function(x) {
if (file.exists(x)) {
data.frame(period=regmatches(x, gregexpr('[0-9]{4}', x))[[1]],
subject=regmatches(x, gregexpr('\\.[0-9]{1}', x))[[1]],
read.table(x, header=FALSE, fill=TRUE),
stringsAsFactors = FALSE)
}
})
# COMBINE EACH DATA FRAME INTO ONE
df <- do.call(rbind, dfList)
# REMOVE PERIOD IN SUBJECT (NEEDED EARLIER FOR SPECIAL DIGIT)
df['subject'] <- sapply(df['subject'],
function(x) gsub("\\.", "", x))
You can try to use tryCatch, which basically would give you a NULL instead of an error:
file <- tryCatch(read.table(file.names[i], header = FALSE, fill = TRUE),
                 error = function(e) NULL)
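In the loop, that could look like the sketch below, which simply skips files that could not be read:
out.list <- list()
for (i in 1:length(file.names)) {
  file <- tryCatch(read.table(file.names[i], header = FALSE, fill = TRUE),
                   error = function(e) NULL)
  if (!is.null(file)) {  # NULL means the read failed, e.g. an empty file
    out.list[[length(out.list) + 1]] <- file
  }
}
out.file <- do.call(rbind, out.list)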

Read in multiple txt files and create a list of them to access each file by accessing the list elements in R

Being relatively new to R programming, I am struggling with a huge data set of 16 text files (comma-separated) saved in one directory. All the files have the same number of columns and the same naming convention, for example file_year_2000, file_year_2001, etc. I want to create a list in R where I can access each file individually by accessing the list elements. By searching through the web I found some code and tried the following, but as a result I get one huge list (16.2 MB) where the output is just strange. I would like to have 16 elements in the list, each representing one file read from the directory. I tried the following code but it does not work as I want:
path = "~/.../.../.../Data_1999-2015"
list.files(path)
file.names <- dir(path, pattern =".txt")
length(file.names)
df_list = list()
for( i in length(file.names)){
file <- read.csv(file.names[i],header=TRUE, sep=",", stringsAsFactors=FALSE)
year = gsub('[^0-9]', '', file)
df_list[[year]] = file
}
Any suggestions?
Thanks in advance.
Just to give more details
path = "~/.../.../.../Data_1999-2015"
list.files(path)
file.names <- dir(path, pattern =".txt")
length(file.names)
df_list = list()
for(i in seq(length(file.names))){
year = gsub('[^0-9]', '', file.names[i])
df_list[[year]] = read.csv(file.names[i],header=TRUE, sep=",", stringsAsFactors=FALSE)
}
Maybe it would be worth joining the data frames into one big data frame with an additional column for the year? See the sketch below.
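A minimal sketch of that idea, assuming the same path and naming convention as above:
path <- "~/.../.../.../Data_1999-2015"
file.names <- dir(path, pattern = ".txt")

big_df <- do.call(rbind, lapply(file.names, function(f) {
  df <- read.csv(file.path(path, f), stringsAsFactors = FALSE)
  df$year <- gsub('[^0-9]', '', f)  # keep the year from the file name
  df
}))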
I assume that instead of "access each file individually" you mean you want to access individually data in each file.
Try something like this (untested):
path = "~/.../.../.../Data_1999-2015"
file.names <- dir(path, pattern =".txt")
df_list = vector("list", length(file.names))
# create a list of data frames with correct length
names(df_list) <- rep("", length(df_list))
# give it empty names to begin with
for( i in seq(along=length(file.names))) {
# now i = 1,2,...,16
file <- read.csv(file.names[i],header=TRUE, sep=",", stringsAsFactors=FALSE)
df_list[[i]] = file
# save the data
year = gsub('[^0-9]', '', file.names[i])
names(df_list)[i] <- year
}
Now you can use either df_list[[1]] or df_list[["2000"]] for year 2000 data.
I am uncertain whether you are reading your csv files from the right directory. If not, use
file <- read.csv(file.path(path, file.names[i]), header = TRUE, sep = ",", stringsAsFactors = FALSE)
when reading the file. (Note that paste0() has no sep argument, so paste0(path, file.names[i], sep = "/") would tack the "/" onto the end; file.path() builds the full path correctly.)

R script to read / process many .csv files from the command line and write all the results to a .csv file

I have many .csv files in a folder. I want to get the binning result from each .csv file, one by one, automatically via an R script run from the command line, and write the result for all files into a result.csv file. For example, I have file01.csv, file02.csv, file03.csv, file04.csv, file05.csv. First the script should read / process file01.csv and write the result into result.csv, then read / process file02.csv and write its result into result.csv, then file03.csv, and so on: a loop over all the files, executed from the command line.
Here is my starting R script:
data <- read.table("file01.csv", sep = ",", header = TRUE)
df.train <- data.frame(data)
library(smbinning)  # Install if necessary

# Analysis by dwell:
df.train_amp <- rbind(df.train)
res.bin <- smbinning(df = df.train_amp, y = "cvflg", x = "dwell")
res.bin  # Result

# Analysis by pv:
df.train_amp <- rbind(df.train)
res.bin <- smbinning(df = df.train_amp, y = "cvflg", x = "pv")
res.bin  # Result
Any suggestions and support would be highly appreciated.
Thanks.
Firstly you will want to read in the files from your directory. Place all of your source files in the same source directory. I am assuming here that your CSV files all have the same shape. Also, I am doing nothing about headers here.
directory <- "C://temp" ## for example
filenames <- list.files(directory, pattern = "*.csv", full.names = TRUE)
bigDF <- data.frame()
for (f in 1:length(filenames)) {
  tmp <- read.csv(filenames[f], stringsAsFactors = FALSE)
  bigDF <- rbind(bigDF, tmp)
}
This appends the rows of tmp to bigDF on each iteration, resulting in the final combined bigDF.
Writing the data frame to a csv is trivial in R as well. Anything like:
# Write to a file, suppress row names
write.csv(bigDF, "myData.csv", row.names=FALSE)
# Same, except that instead of "NA", output blank cells
write.csv(bigDF, "myData.csv", row.names=FALSE, na="")
# Use tabs, suppress row names and column names
write.table(bigDF, "myData.csv", sep="\t", row.names=FALSE, col.names=FALSE)
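Since the question asks about running this from the command line: save the steps above in a script file (the name combine_and_bin.R below is just a placeholder) and execute it with Rscript, which ships with R:
Rscript combine_and_bin.R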
Finally, I found the above problem can be solved as follows:
library(smbinning)  # Install if necessary

files <- list.files(pattern = ".csv")  ## creates a vector with all file names in your folder
cutpoint <- vector("list", length(files))
bands <- vector("list", length(files))

for (i in 1:length(files)) {
  data <- read.csv(files[i], header = TRUE)
  df.train <- data.frame(data)
  df.train_amp <- rbind(df.train, df.train, df.train, df.train,
                        df.train, df.train, df.train, df.train)  # Just to multiply the data
  res.bin <- smbinning(df = df.train_amp, y = "cvflg", x = "dwell")  # smbinning is calculated here
  cutpoint[[i]] <- res.bin$cuts  # smbinning returns a list; cut points in $cuts
  bands[[i]] <- res.bin$bands    # band boundaries in $bands
}

# Produce detailed results (cut points and bands flattened to strings)
result <- cbind(files,
                cutpoint = sapply(cutpoint, toString),
                bands = sapply(bands, toString))
write.csv(result, "result_dwell.csv")  # write result into csv file
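If you specifically want each file's result appended to result.csv as it is processed, one alternative sketch is to write inside the loop with append = TRUE:
# Inside the loop, after computing cutpoint[[i]] and bands[[i]] for files[i]:
row_i <- data.frame(file = files[i],
                    cutpoint = toString(cutpoint[[i]]),
                    bands = toString(bands[[i]]))
write.table(row_i, "result.csv", sep = ",",
            append = (i > 1),      # overwrite on the first file, append afterwards
            col.names = (i == 1),  # write the header only once
            row.names = FALSE)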
