I tried to download several thousand SEC files via the command:
# Download `link`; the second positional argument (`folder`) is used as the
# destination file path, and mode = "wb" requests a binary transfer.
download.file(link, folder, method = "internal", quiet = FALSE,
mode = "wb", cacheOK = TRUE,
extra = getOption("download.file.extra"))
After a while I get the following message that I cannot interpret:
https://dl.dropboxusercontent.com/u/4149177/Capture.PNG
It seems that the files are downloaded successfully; however, I want to know what the message means.
Can you tell me what R tries to tell me?
Full code:
# NOTE(review): setInternet2() is a Windows-only utils function — confirm it still exists in the R version in use
setInternet2(use = FALSE)
# base directory for downloaded filings (the working directory at start-up)
destinationfolder <- getwd()
# inclusive range of years and quarters whose index files are loaded
startyear <- 2000
stopyear <- 2000
startquarter <- 1
stopquarter <- 2
# form type used to filter the filings table
filetype <- "10-Q"
#### download one quarterly EDGAR company index and parse it
#### year, quarter: values pasted into the index URL (.../full-index/<year>/QTR<quarter>/company.zip)
#### returns a data frame with columns company_name, form_type, cik, date_filed, file_name,
#### or NULL when the download fails
func.getsecindexfile <- function(year, quarter) {
  #### download the zipped index file from the SEC website
  tf <- tempfile()
  # the archive is only needed while parsing, so remove it when the function exits
  on.exit(unlink(tf), add = TRUE)
  index_url <- paste0(
    "http://www.sec.gov/Archives/edgar/full-index/",
    year, "/QTR", quarter, "/company.zip"
  )
  # mode = "wb": the archive is binary and must not be translated as text
  result <- try(download.file(url = index_url, destfile = tf, mode = "wb"))
  #### a failed download yields NULL so callers can skip this quarter
  if (inherits(result, "try-error")) {
    return(NULL)
  }

  #### small function to remove leading and trailing spaces
  trim <- function(string) {
    string <- enc2native(string)
    gsub("^\\s*(.*?)\\s*$", "\\1", string, perl = TRUE)
  }

  #### read the downloaded file; close the connection even if reading fails
  zz <- unz(description = tf, filename = "company.idx")
  raw.data <- tryCatch(readLines(con = zz), finally = close(zz))

  #### remove the first 10 rows (header); unlike 11:length(x) this is also
  #### safe when the file has 10 lines or fewer
  raw.data <- raw.data[-seq_len(10)]

  #### parse the fixed-width columns and return them as a data frame
  data.frame(
    company_name = trim(substr(raw.data, 1, 62)),
    form_type = trim(substr(raw.data, 63, 74)),
    cik = trim(substr(raw.data, 75, 86)),
    date_filed = as.Date(substr(raw.data, 87, 98)),
    file_name = trim(substr(raw.data, 99, 150))
  )
}
#### add index files to database
# Append one parsed index file to the "filings" table of the `sqlite` connection.
# data: data frame to append, or NULL (nothing is written and NULL is returned).
# Returns whatever dbWriteTable() returns.
func.addindexfiletodatabase <- function(data) {
  if (is.null(data)) {
    NULL
  } else {
    dbWriteTable(sqlite, "filings", data, append = TRUE)
  }
}
#### rebuild the filings table from the quarterly index files
# DROP TABLE returns no rows, so it is issued with dbExecute() instead of dbGetQuery()
dbExecute(sqlite, "DROP TABLE IF EXISTS filings")
for (year in startyear:stopyear) {
  for (quarter in startquarter:stopquarter) {
    func.addindexfiletodatabase(func.getsecindexfile(year, quarter))
  }
}

#### select the filings of the requested form type
selection <- paste0("SELECT * FROM filings WHERE form_type IN ('", filetype, "')")
index <- dbGetQuery(sqlite, selection)

#### build the download link and a flat local file name for every filing
# sprintf() returns a zero-length vector for a zero-row result, so an empty
# selection no longer fails here
link <- sprintf("ftp://ftp.sec.gov/%s", index$file_name)
# fixed = TRUE: the prefix and "/" are literal text, not regular expressions
index$name_new <- gsub("ftp://ftp.sec.gov/edgar/data/", "", link, fixed = TRUE)
index$name_new <- gsub("/", "-", index$name_new, fixed = TRUE)
name <- as.character(index$name_new)
#### define download function
# Download one filing into `destinationfolder`.
# link: URL of the filing; name: file name to store it under.
# Returns the status code of download.file().
func.download_files <- function(link, name) {
  # file.path() builds a path that is valid on every platform; the former
  # hard-coded "\\" separator only worked on Windows
  destination <- file.path(destinationfolder, name)
  download.file(
    link, destination,
    method = "internal", quiet = FALSE, mode = "wb", cacheOK = TRUE,
    extra = getOption("download.file.extra")
  )
}
#### download the files
# func.download_files is called once per (link, name) pair; the two vectors are walked in parallel
mapply(FUN = func.download_files,link=link,name=name)
The "error" was a notification that the files were successfully downloaded. Thank you for your help.
Related
I am writing a function to download a set of CSV files from a "click and download" webpage. It was working wonderfully:
# Download the daily BCA import/export CSV for each date from start_date to
# end_date and store it as ./BCA/BCA<date>.csv.
# start_date, end_date: values that as.Date() can convert.
mydownloadBCA <- function(start_date, end_date) {
  first_day <- as.Date(start_date)
  last_day <- as.Date(end_date)
  # day numbers since the epoch, converted back to Date objects
  dates <- as.Date("1970-01-01") + (first_day:last_day)
  url_head <- "https://www.cenace.gob.mx/DocsMEM/OpeMdo/CantidAsig/MDA/ImportacionExportacion/Resultados_ImpExp%20BCA%20MDA%20Dia%20"
  url_tail <- "%20v2017%2003%2022_09%2033%2019.csv"
  for (day_label in as.character(dates)) {
    download.file(
      url = paste0(url_head, day_label, url_tail),
      destfile = paste0("./BCA/BCA", day_label, ".csv"),
      quiet = TRUE
    )
  }
}
For a first "chunk" the url only varies given a date:
[2016-01-29] https://www.cenace.gob.mx/DocsMEM/OpeMdo/CantidAsig/MDA/ImportacionExportacion/Resultados_ImpExp%20SIN%20MDA%20Dia%202016-01-29%20v2017%2003%2022_10%2033%2019.csv
[2016-10-31]
https://www.cenace.gob.mx/DocsMEM/OpeMdo/CantidAsig/MDA/ImportacionExportacion/Resultados_ImpExp%20SIN%20MDA%20Dia%202016-10-31%20v2017%2003%2022_10%2033%2019.csv
Afterwards the webpage has been updated on a daily basis generating a changing url without a pattern.
After 2017-03-30 the URL changes not only with the date but also with a numeric suffix that follows no obvious pattern. The problem is the last part, "%XXXX%XXXX_XX%XXXX%XXXX.csv"
for example:
<<url>> = https://www.cenace.gob.mx/DocsMEM/OpeMdo/CantidAsig/MDA/ImportacionExportacion/Resultados_ImpExp%20BCA%20MDA
2017-03-30 <<url>>%20Dia%202017-03-30%20v2017%2003%2029_13%2029%2051.csv
2017-04-01 <<url>>%20Dia%202017-04-01%20v2017%2003%2031_13%2044%2042.csv
2017-04-02 <<url>>%20Dia%202017-04-02%20v2017%2004%2001_12%2057%2041.csv
## Problems here ^^^^^^^^^^^^^^^^^^^^^^
I tried to account for it with a loop but so far it has not been working:
# Download the daily SIN import/export CSV for each date from start_date to
# end_date and store it as ./SIN/SIN<date>.csv.
#
# start_date, end_date: values that as.Date() can convert.
# Returns NULL invisibly. A date for which no candidate URL could be downloaded
# is reported with a warning instead of aborting the whole run.
#
# The version suffix of the URL is guessed by filling every placeholder with
# the same digit, trying the digits 0-9 in turn.
# NOTE(review): the suffixes observed for this site (e.g. "%2003%2029_13%2029%2051")
# mix different digits, so a single repeated digit presumably never matches;
# the real file URL most likely has to be read from the listing page instead.
mydownloadSIN <- function(start_date, end_date) {
  start_date <- as.Date(start_date)
  end_date <- as.Date(end_date)
  dates <- as.Date("1970-01-01") + (start_date:end_date)
  # the digits themselves; as.numeric(factor(0:9)) would give the codes 1..10
  digits <- as.character(0:9)
  url_head <- "https://www.cenace.gob.mx/DocsMEM/OpeMdo/CantidAsig/MDA/ImportacionExportacion/Resultados_ImpExp%20SIN%20MDA%20Dia%20"

  for (i in seq_along(dates)) {
    string_date <- as.character(dates[i])
    myfile <- paste0("./SIN/SIN", string_date, ".csv")
    downloaded <- FALSE
    # iterate over the digits directly; indexing from 0 produced an empty
    # string and therefore the malformed "%%_%%" URL
    for (X in digits) {
      myurl <- paste0(
        url_head, string_date, "%20v2017%", X, X, X, X, "%", X, X, X, X,
        "_", X, X, "%", X, X, X, X, "%", X, X, X, X, ".csv"
      )
      downloaded <- tryCatch(
        {
          download.file(url = myurl, destfile = myfile, quiet = TRUE)
          TRUE
        },
        warning = function(w) FALSE,
        error = function(e) FALSE
      )
      if (downloaded) {
        break
      }
      # do not leave a partial file behind after a failed attempt
      unlink(myfile)
    }
    if (!downloaded) {
      warning("No candidate URL could be downloaded for ", string_date, call. = FALSE)
    }
  }
  invisible(NULL)
}
When trying using the function I get the following error:
Error in download.file(url = myurl, destfile = myfile, quiet = TRUE) :
cannot open URL
'https://www.cenace.gob.mx/DocsMEM/OpeMdo/CantidAsig/MDA/ImportacionExportacion/Resultados_ImpExp%20SIN%20MDA%20Dia%202017-03-24%20v2017%%_%%.csv'
In addition: Warning message: In download.file(url = myurl, destfile =
myfile, quiet = TRUE) : cannot open URL
'https://www.cenace.gob.mx/DocsMEM/OpeMdo/CantidAsig/MDA/ImportacionExportacion/Resultados_ImpExp%20SIN%20MDA%20Dia%202017-03-24%20v2017%%_%%.csv':
HTTP status was '400 Bad Request'
This is the "general" webpage where the user selects the years they need and then clicks on the CSV archive:
https://www.cenace.gob.mx/SIM/VISTA/REPORTES/H_RepCantAsignadas.aspx?N=135&opc=divCssCantAsig&site=Cantidades%20asignadas/MDA/De%20Importación%20y%20Exportación&tipoArch=C&tipoUni=BCN&tipo=De%20Importación%20y%20Exportación&nombrenodop=MDA
Is there a way I can account for this change in the url for my function?
Thanks
I have 44 doc files. From each file, I need to extract the customer name and amount. I am able to do this for one file using the read_document command and using grep to extract the amount and customer name. When I do this for 44 files, I get an error. I am not sure where I am going wrong:
library(textreadr)
# full.names = TRUE returns paths that include "~/experiment", so the files are
# found regardless of the working directory; the pattern is a regular
# expression anchored to the .doc/.docx extension
files <- list.files("~/experiment", pattern = "\\.docx?$", full.names = TRUE)
files
length(files)
# read every file (not only the last one) and keep the results, named by file
documents <- lapply(files, read_document)
names(documents) <- basename(files)
Here is the full code that I run on one file:
library(textreadr)
library(tm)
library(stringr)

file <- "~/customer_full_file.docx"
full_customer_file <- read_document(file, skip = 0, remove.empty = TRUE, trim = TRUE)
# checking file is read correctly
head(full_customer_file)
tail(full_customer_file)
# keep only elements 1 and 4 of the document
full_customer_file <- full_customer_file[c(1, 4)]

# Extracting amount: collapse repeated whitespace, then drop the label
amount_extract <- grep("Amount", full_customer_file, value = TRUE)
amount_extract_2 <- vapply(amount_extract, stripWhitespace, character(1), USE.NAMES = FALSE)
# the original referenced an undefined `marks_extract_2` here
amount_extract_2 <- str_remove(amount_extract_2, "Amount")

# Extracting name
name_extract <- grep("Customer Name and ID: ", full_customer_file, value = TRUE)
name_extract
name_extract_2 <- vapply(name_extract, stripWhitespace, character(1), USE.NAMES = FALSE)
name_extract_2 <- str_remove(name_extract_2, "Customer Name and ID: ")

name_extract_2 <- data.frame(customer_full_name = name_extract_2)
amount_extract_2 <- data.frame(amount = amount_extract_2)
amount_extract_2
customer_final_file <- cbind(name_extract_2, amount_extract_2)

# write the header only when the file is created, so repeated appends do not
# insert a header line between data rows
output_file <- "~/customer_amount.csv"
write.table(
  customer_final_file, output_file,
  sep = ",", col.names = !file.exists(output_file), append = TRUE
)
Here is the code that I run on the 44 files:
library(textreadr)
# list.files() returns bare file names by default, which read_document() then
# looks up in the working directory instead of "~/experiment";
# full.names = TRUE returns usable paths. The pattern is a regular expression
# anchored to the .doc/.docx extension.
files <- list.files("~/experiment", pattern = "\\.docx?$", full.names = TRUE)
files
length(files)
# keep what was read instead of discarding it on every iteration
documents <- lapply(files, read_document)
names(documents) <- basename(files)
Here is the error that I am getting:
> library(textreadr)
> for (i in 1:length(files)){
+ read_document(files[i])
+ }
Warning messages:
1: In utils::unzip(file, exdir = tmp) :
error 1 in extracting from zip file
2: In utils::unzip(file, exdir = tmp) :
error 1 in extracting from zip file
3: In utils::unzip(file, exdir = tmp) :
error 1 in extracting from zip file
4: In utils::unzip(file, exdir = tmp) :
error 1 in extracting from zip file
5: In utils::unzip(file, exdir = tmp) :
error 1 in extracting from zip file
I can give you the code I used to analyze several Word files with the sentimentr package in R. You can probably reuse the same structure and just change the body of the for loop so that it runs your extraction on every docx file.
And this is the code:
# read_docx() and docx_summary() as used here come from the officer package,
# which the original snippet never attached
library(officer)
library(sentimentr)
folder_path <- "C:\\Users\\yourname\\Documents\\R\\"
# Get a list of all the docx files in the folder
docx_files <- list.files(path = folder_path, pattern = "\\.docx$", full.names = TRUE)

# Score one docx file: returns a data frame with one row per retained text
# element (columns file, sentiment), or NULL when no element qualifies
score_docx <- function(file) {
  content <- docx_summary(read_docx(file))
  text <- as.character(content$text)
  # keep elements longer than 5 characters; missing text is dropped
  law_text <- text[!is.na(text) & nchar(text) > 5]
  if (length(law_text) == 0) {
    return(NULL)
  }
  # Calculate the sentiment of the summary (or in your case extraction)
  sentiment <- sentiment_by(law_text)
  data.frame(file = file, sentiment = sentiment$ave_sentiment)
}

# Build all per-file results first and bind them once instead of growing the
# data frame inside a loop
results <- do.call(rbind, lapply(docx_files, score_docx))
if (is.null(results)) {
  results <- data.frame(file = character(0), sentiment = numeric(0))
}
# View the results data frame
View(results)
I hope that is near enough to your problem to solve it
This program works because I made the variables inside lapply global by using the <<- operator. However, it does not work with the real files in the real program. These are .tsv files with named columns. The answer I get when I run the real program is: Error: (converted from warning) Error in : (converted from warning) Error in : arguments imply differing number of rows: 3455, 4319. What might be causing this?
# Three groups of input file names (all placeholders pointing at "test.txt").
lc <- list("test.txt", "test.txt", "test.txt", "test.txt")
lc1 <- list("test.txt", "test.txt", "test.txt")
lc2 <- list("test.txt", "test.txt")
# list of lists; each inner list contains file names
lc <- list(lc, lc1, lc2)
# new names for the three lists in the list of lists
new_dataFns <- list("name1", "name2", "name3")
# file_paths is not used again in this snippet
file_paths <- NULL
new_path <- NULL
# prefix the file names with the working directory, then read every file of each inner list
lapply(
lc,
function(lc) {
filenames <- file.path(getwd(), lc)
# `<<-` assigns outside the function, so every inner list overwrites dataList;
# after the loop only the data read for the last inner list remains
dataList <<- lapply(filenames, function (lc) read.table(file=lc, header=TRUE))
# the parameter `dataList` shadows the outer variable: each data frame is merged with itself
dataList <<- lapply(dataList, function(dataList) {merge(as.data.frame(dataList),as.data.frame(dataList))})
}
)
# build <working directory>/<new name> for each new name; each iteration
# overwrites new_path, so only the path for the last name remains
lapply(new_dataFns, function(new_dataFns) {new_path <<- file.path(getwd(), new_dataFns)})
print(new_path)
print(dataList)
# NOTE(review): as.data.frame() on a list of data frames binds them side by side, which
# presumably raises "arguments imply differing number of rows" once the files differ in length — confirm
finalFiles <- merge(as.data.frame(dataList), as.data.frame(new_path))
print(finalFiles)
I found a solution to the problem by writing a different type of code. Please see below. The input to the function is provided by the app input widgets
# Append the contents of grouped CSV files to one output file per group.
#
# nameList: list whose i-th element holds the file names (relative to the
#   working directory) of group i; NULL groups are skipped.
# answer, fileChoice, combination, enteredValue: accepted for the caller's
#   benefit but not used in this function body.
#
# Each readable file of group i is appended to "<working directory>/<i>.csv"
# with write.table() (no column names). Files that cannot be read are skipped.
# Returns NULL invisibly.
glyCount1 <- function(answer = NULL, fileChoice = NULL, combination = NULL,
                      enteredValue = NULL, nameList) {
  for (i in seq_along(nameList)) {
    group <- nameList[[i]]
    if (is.null(group)) {
      next
    }
    out_path <- paste0(getwd(), "/", i, ".csv")
    for (j in seq_along(group)) {
      # read the j-th file of the group; the original indexed the path with i,
      # so it kept re-reading the wrong file
      in_path <- paste0(getwd(), "/", as.character(group[[j]]))
      file_content <- tryCatch(
        read.csv(file = in_path, header = TRUE, sep = ","),
        error = function(e) NULL
      )
      # skip unreadable files instead of re-writing content from an earlier read
      if (is.null(file_content)) {
        next
      }
      write.table(file_content, file = out_path, append = TRUE, col.names = FALSE)
    }
  }
  invisible(NULL)
}
I have a folder with about 700 text files that I want to import and add a column to. I've figured out how to do this using the following code:
# Combine every file whose name ends in "c.txt" into one data frame, tagging
# each row with the file name minus that suffix, and export it to Excel.
files <- list.files(pattern = "c\\.txt$")
# read.table() stops with "no lines available in input" on empty files, so
# keep only files that contain data
files <- files[file.size(files) > 0]
tables <- lapply(files, function(f) {
  data <- read.table(f, header = FALSE, sep = ",")
  # strsplit() returns a list; take the text before "c.txt" as a plain string
  data$species <- strsplit(f, split = "c.txt")[[1]][1]
  data
})
# bind once at the end instead of growing DF inside a loop
DF <- do.call(rbind, tables)
write.xlsx(DF, "B:/trends.xlsx")
The problem is that about 100 of the files are empty, so the code stops at the first empty file and I get this error message:
Error in read.table(f, header = F, sep = ",") :
no lines available in input
Is there a way to skip over these empty files?
You can skip empty files by checking that file.size(some_file) > 0:
# full.names = TRUE keeps the directory in each path, so file.size() and
# read.csv() work from any working directory; the pattern is a regular
# expression anchored to the .csv extension
files <- list.files("~/tmp/tmpdir", pattern = "\\.csv$", full.names = TRUE)
##
# empty files yield NULL, which rbind() later ignores
df_list <- lapply(files, function(x) {
  if (file.size(x) > 0) {
    read.csv(x)
  }
})
##
R> dim(do.call("rbind", df_list))
#[1] 50 2
This skips over the 10 files that are empty, and reads in the other 10 that are not.
Data:
# Create 10 populated and 10 empty CSV files as example data.
# make sure the target directory exists before writing into it
dir.create("~/tmp/tmpdir", recursive = TRUE, showWarnings = FALSE)
for (i in 1:10) {
  df <- data.frame(x = 1:5, y = 6:10)
  write.csv(df, sprintf("~/tmp/tmpdir/file%i.csv", i), row.names = FALSE)
  ## empty file; file.create() works on every platform, unlike system("touch ...")
  file.create(sprintf("~/tmp/tmpdir/emptyfile%i.csv", i))
}
For a different approach that introduces explicit error handling, think about a tryCatch to handle anything else bad that might happen in your read.table.
# Read every non-empty file, tag its rows with the species taken from the file
# name, and append them to DF. Empty or unreadable files are skipped.
for (f in files) {
  data <- tryCatch({
    # an empty file yields NULL here
    if (file.size(f) > 0) {
      read.table(f, header = FALSE, sep = ",")
    }
  }, error = function(err) {
    # error handler picks up where error was generated
    print(paste("Read.table didn't work!: ",err))
    # return NULL so the failed file is skipped instead of binding the message text
    NULL
  })
  if (is.null(data)) {
    next
  }
  # strsplit() returns a list; take the text before "c.txt" as a plain string
  data$species <- strsplit(f, split = "c.txt")[[1]][1]
  DF <- rbind(DF, data)
}
I have a problem making R read a set of files in a folder and return their cross products.
I have a folder which contains one test.csv file and n train.csv files.
I need a loop that reads through the folder and returns a file containing the cross product of test and each of the train files, so the rows of the file should look like this:
test*train01
test*train02
test*train03
...
I wrote a script that does this for two specific files, but I don't know how to adapt it to the whole folder and the pattern that I need.
# Row-wise cross products of test.csv and train.csv, written to testing.csv.
data01 <- as.matrix(read.csv(file = "test.csv", sep = ",", header = FALSE))
data02 <- as.matrix(read.csv(file = "train.csv", sep = ",", header = FALSE))
# number of leading rows to process (previously a literal 25 in the loop condition)
n_rows <- 25
stopifnot(nrow(data01) >= n_rows, nrow(data02) >= n_rows)
rows <- seq_len(n_rows)
# one vector per row of each input
test01 <- lapply(rows, function(i) c(data01[i, ]))
test02 <- lapply(rows, function(i) c(data02[i, ]))
# crossprod() of two vectors is their inner product, returned as a 1 x 1 matrix
test <- lapply(rows, function(i) crossprod(test01[[i]], test02[[i]]))
write.csv(test, file = "testing.csv", row.names = FALSE)
Try:
# Row-wise cross products of "test.csv" with one training file.
#
# data:   path of the training CSV (no header row).
# n_rows: number of leading rows to process; defaults to 25, the value that
#         used to be hard-coded.
# Returns a list of length n_rows; element i is the 1 x 1 matrix
# crossprod(row i of test.csv, row i of `data`).
# Stops when either file has fewer than n_rows rows.
test <- function(data, n_rows = 25) {
  data01 <- as.matrix(read.csv(file = "test.csv", sep = ",", header = FALSE))
  data02 <- as.matrix(read.csv(file = data, sep = ",", header = FALSE))
  if (nrow(data01) < n_rows || nrow(data02) < n_rows) {
    stop("both files need at least ", n_rows, " rows", call. = FALSE)
  }
  lapply(seq_len(n_rows), function(i) {
    crossprod(c(data01[i, ]), c(data02[i, ]))
  })
}
# Apply test() to every file in the working directory whose name matches
# 'Train.*'; `result` holds one element per matching file.
train_files <- list.files(pattern='Train.*')
result <- lapply(train_files, test)
Then just loop over result to save each element to a CSV file.
EDIT: How to save:
# Write each element of `result` to its own CSV, named after the input file.
# NOTE(review): the directory is listed again here, so `files` lines up with
# `result` only if the same pattern still matches the same files — confirm
files <- list.files(pattern='Train.*')
# seq_along() yields no iterations for an empty `result`;
# seq(length(result)) would yield c(1, 0)
for (i in seq_along(result)) {
  write.csv(result[[i]], paste0('result_', files[i]), row.names = FALSE)
}
EDIT: Saving in one file:
# bind the elements of `result` with rbind, so each element becomes one row of the single output file
write.csv(do.call(rbind,result),'result.csv', row.names = FALSE) # Appending by row
or
# bind the elements of `result` with cbind, so they are placed side by side in the single output file
write.csv(do.call(cbind,result),'result.csv', row.names = FALSE) # Appending by column