I'm trying to download data from Table2 from this link.
http://finviz.com/quote.ashx?t=aapl
I'd like to write all the elements in an array into a CSV file but I can't even get the loop working. The paste function seems unnecessary, but that's all I could find after several Google searches.
library('rvest')
# Tickers whose finviz quote pages are requested below.
stocks <- c("AXP","BA","CAT","CSCO")
# Loop once per ticker position.
for (i in 1 : length(stocks))
{
# NOTE(review): `stocks` is passed whole instead of `stocks[i]`, so paste()
# yields one URL per ticker (a length-4 vector) on every pass. `url` is not
# used again in this loop.
url <- sprintf(paste("http://finviz.com/quote.ashx?t=", stocks));
# NOTE(review): paste() defaults to sep = " ", which puts a space between
# "t=" and the ticker; read_html() is also handed the whole vector of URLs.
x<-read_html(paste("http://finviz.com/quote.ashx?t=", stocks))
# The extracted table is neither assigned nor printed, so it is discarded
# (values are not auto-printed inside a for loop).
html_table( x %>% html_nodes("table.snapshot-table2"))
}
Thanks for the help with this.
I am unfamiliar with the rvest package; however, you could handle the HTML table extraction with R's XML package, specifically its readHTMLTable function.
library(XML)
stocks <- c("AXP", "BA", "CAT", "CSCO")

# Column labels applied to the 12-column snapshot table.
snapshot_cols <- paste0("data", 1:12)

# Download the quote page for one ticker and return its snapshot table as a
# data frame whose first column, `stock`, identifies the ticker.
#
# s: a single ticker symbol (character scalar).
read_snapshot <- function(s) {
  url <- paste0("http://finviz.com/quote.ashx?t=", s)
  webpage <- readLines(url)
  html <- htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
  tableNodes <- getNodeSet(html, "//table")
  # NOTE(review): the snapshot table is assumed to be the 9th <table> on the
  # page; this index depends on the site layout -- confirm before relying on it.
  df <- readHTMLTable(tableNodes[[9]], header = snapshot_cols)
  # Prepending the id here replaces the later column reshuffle, whose
  # `1:ncol(x)-1` index only worked because a 0 index is silently dropped.
  cbind(stock = s, df)
}

# One data frame per ticker, kept in a named list instead of separate
# global variables created through assign()/get().
stockdatalist <- lapply(stocks, read_snapshot)
names(stockdatalist) <- stocks

# COMBINE ALL STOCK DATA
stockdata <- do.call(rbind, stockdatalist)

# SAVE TO CSV (no header row, as before)
write.table(stockdata, "C:\\Path\\To\\StockData.csv", sep = ",",
            row.names = FALSE, col.names = FALSE)
To extract every stock indicator together with its corresponding value, as shown on the website, and collect them in separate vectors, consider using XML's xpathSApply() with an XPath expression that addresses the required table row and cell. Then append each returned value to the vector for its indicator and combine the vectors into a final data frame:
# Build one vector per stock indicator by scraping label/value cell pairs
# from each ticker's quote page, then combine the vectors into a data frame.
# Relies on `stocks` being defined already and on the XML package being loaded.
for (s in stocks) {
url <- paste0("http://finviz.com/quote.ashx?t=", s)
webpage <- readLines(url)
html <- htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
# Visit rows 1-12 and the odd columns 1, 3, ..., 11 of the table matched by
# "//table[9]": td[col] supplies the indicator label, td[col + 1] its value.
for (row in 1:12) {
for (col in seq(1, 12, 2)) {
# NOTE(review): the "first stock" branch is hard-coded to "AXP"; with a
# different first ticker the get() in the else branch would presumably fail
# because no variable named after the label exists yet.
if (s == "AXP") {
# Create a variable named after the label, holding this stock's value.
assign(xpathSApply(html, sprintf("//table[9]/tr[%d]/td[%d]", row, col), xmlValue)[1],
xpathSApply(html, sprintf("//table[9]/tr[%d]/td[%d]", row, col+1), xmlValue)[1])
} else {
# Fetch the vector stored under the label, append this stock's value,
# and store the longer vector back under the same name.
stocklist <- get(xpathSApply(html, sprintf("//table[9]/tr[%d]/td[%d]", row, col), xmlValue)[1])
stocklist <- c(stocklist,
xpathSApply(html, sprintf("//table[9]/tr[%d]/td[%d]", row, col+1), xmlValue)[1])
assign(xpathSApply(html, sprintf("//table[9]/tr[%d]/td[%d]", row, col), xmlValue)[1],
stocklist)
}
}
}
}
# NOTE(review): `stock` is not defined anywhere in this snippet, so rm() will
# warn that the object was not found.
rm(stocklist, stock, url, html, webpage, row, col, s) # REMOVE TEMP OBJECTS
# NOTE(review): hard-coded replacement of one indicator's values; presumably
# needed because this label occurs more than once in the table, which would
# leave the scraped vector with the wrong length -- TODO confirm.
`EPS next Y` <- c("2.90%","14.21%","-21.84%","5.63%") # CLEAN UP ONE LIST
# COMBINE LISTS INTO DATA FRAME
# NOTE(review): ls() returns every object in the current environment, so
# `lists` also contains "stocks" and anything else still defined in the
# session, not only the indicator vectors created above.
lists <- as.list(ls()) # LIST OF STOCK INDICATORS
df <- data.frame(stocks=unlist(stocks)) # CREATE DF
# Append each listed object as a further column of df.
for (i in lists){
slist <- get(i)
df <- cbind(df, slist) # ADD DF COLS
}
names(df) <- c("Stock", lists) # RENAME COLUMNS
Related
I have a bunch of csv files that I'm trying to read into R all at once, with each data frame from a csv becoming an element of a list. The loops largely work, but they keep overriding the list elements. So, for example, if I loop over the first 2 files, both data frames in list[[1]] and list[[2]] will contain the data frame for the second file.
# Read the "coreco.core" CSV for one year and organisation type.
#
# year:    used both as the folder name and inside the file name
# orgtype: file-name suffix placed directly before ".csv"
#
# Returns the data with lower-cased column names, restricted to rows whose
# ntee1 is "C" or "D" and whose nteecc is none of D20, D40, D50, D60, D61.
open_csv_core <- function(year, orgtype) {
  csv_path <- paste0(year, "/coreco.core", year, orgtype, ".csv")
  records <- read.csv(csv_path)
  names(records) <- tolower(names(records))

  in_major_group <- records$ntee1 %in% c("C", "D")
  is_excluded <- records$nteecc %in% c("D20", "D40", "D50", "D60", "D61")

  records[in_major_group & !is_excluded, ]
}
# Read the "nccs.core" CSV for one year and organisation type.
#
# year:    used both as the folder name and inside the file name
# orgtype: file-name suffix placed directly before ".csv"
#
# Returns the data with lower-cased column names, keeping only rows whose
# ntee1 is "C" or "D" and dropping nteecc codes D20, D40, D50, D60 and D61.
open_csv_nccs <- function(year, orgtype) {
  source_file <- paste0(year, "/nccs.core", year, orgtype, ".csv")
  tbl <- read.csv(source_file)
  names(tbl) <- tolower(names(tbl))

  wanted <- tbl[tbl$ntee1 %in% c("C", "D"), ]
  dropped_codes <- c("D20", "D40", "D50", "D60", "D61")
  wanted[!(wanted$nteecc %in% dropped_codes), ]
}
#############################################################################
# Result containers, one per organisation type; each is filled with data
# frames keyed by "<year><type>".
yrpc<- list()
yrpf<- list()
yrco<- list()
fname<- vector()
# Folder names: one folder per year, 1989 through 2019.
file_yrs<- as.character(c(1989:2019))
# Outer loop: i walks over the year folders.
for(i in 1:length(file_yrs)){
fname<- list.files(path = file_yrs[i], pattern = NULL)
#accessing files in a folder and assigning to the proper function to open them based on how the file is named
# Inner loop: j walks over the files found in the current year folder.
# NOTE(review): if a folder is empty, 1:length(fname) iterates over c(1, 0).
for(j in 1:length(fname)){
# NOTE(review): every open_csv_* call below passes file_yrs[j], i.e. the year
# is picked by the file's position j inside the folder, while the list key is
# built from file_yrs[i]. This looks like it should be file_yrs[i] throughout.
# NOTE(review): keys are one per year and type, so if a folder held both an
# "nccs" and a non-"nccs" file of the same type, the later would replace the
# earlier entry.
if(grepl("pc.csv", fname[j])==T) {
if(grepl("nccs", fname[j])==T){
a <- open_csv_nccs(file_yrs[j], "pc")
yrpc[[paste0(file_yrs[i], "pc")]] <- a
} else {
b<- open_csv_core(file_yrs[j], "pc")
yrpc[[paste0(file_yrs[i], "pc")]] <- b
}
} else if (grepl("pf.csv", fname[j])==T){
if(grepl("nccs", fname[j])==T){
c <- open_csv_nccs(file_yrs[j], "pf")
yrpf[[paste0(file_yrs[i], "pf")]] <- c
} else {
d<- open_csv_core(file_yrs[j], "pf")
yrpf[[paste0(file_yrs[i], "pf")]] <- d
}
} else {
# Any file that matches neither "pc.csv" nor "pf.csv" is treated as "co".
if(grepl("nccs", fname[j])==T){
e<- open_csv_nccs(file_yrs[j], "co")
yrco[[paste0(file_yrs[i], "co")]] <- e
} else {
f<- open_csv_core(file_yrs[j], "co")
yrco[[paste0(file_yrs[i], "co")]] <- f
}
}
}
}
Actually, both of your csv reading functions do exactly the same,
except that the paths are different.
If you list your files with their full paths (directory included) instead of
just the bare file names, you don't need to reconstruct the paths the way
you do. This is possible with full.names = TRUE in list.files().
The second point is that there never seems to be a "nccs.core" file in addition
to a "coreco.core" file for the same year and the same type, so the two are mutually
exclusive. That means no logic is needed to distinguish those cases, which simplifies our code.
The third point is, you just want to separate the data frames by filetype ("pc", "pf", "co") and years.
Instead of creating three separate lists, one per type, I would create a single results list (res) that contains one inner list for each type.
I would solve this like this:
years <- 1989:2019

# Extract the organisation type ("pc", "pf" or "co") from a path that ends
# in "<type>.csv".
path_to_type <- function(path) sub(".*(pc|pf|co)\\.csv$", "\\1", path)

# Read one CSV, lower-case its column names and keep only the rows of
# interest: ntee1 "C" or "D", excluding nteecc D20, D40, D50, D60 and D61.
read_filtered <- function(path) {
  print(path) # just to signal that the path is getting processed
  df <- read.csv(path)
  names(df) <- tolower(names(df))
  df <- df[df$ntee1 %in% c("C", "D"), ]
  df[!(df$nteecc %in% c("D20", "D40", "D50", "D60", "D61")), ]
}

res <- list("pc" = list(),
            "pf" = list(),
            "co" = list())

# Plain for loops are used so that the assignments modify `res` itself; an
# assignment inside an lapply() callback would only change a local copy.
# Years are used as character keys: list.files() needs a character path, and
# a numeric index such as [[1995]] would pad the list with ~2000 empty slots.
for (year in as.character(years)) {
  files <- list.files(path = year, pattern = "(pc|pf|co)\\.csv$",
                      full.names = TRUE)
  for (path in files) {
    res[[path_to_type(path)]][[year]] <- read_filtered(path)
  }
}

# Now you can look up a data frame in the result list by file type and year
# (the year is a character key), e.g.:
res[["co"]][["1995"]]
res[["pf"]][["2018"]]
And so on.
Actually, the results of the lapply() calls in this case are
not interesting. Just the content of res ... (result list).
It seems that in your for(j in 1:length(fname)){... loop you assign to one of the variables a, b, c, d, e or f on each pass. You are reusing these variable names, so their values get overwritten on every iteration.
The "correct" way to do this is to use lapply in place of the for loop. Pass the list of files, and the required function (i.e. open_csv_core, etc) to lapply, and the return value that you get back is a list of the results.
I'm using the following code to merge several excel files with multiple sheets. I get an error when it runs across a sheet that has the same header as the other files but is not populated with data. This is the error:
Error in data.frame(sub.id, condition, s.frame, ss) :
arguments imply differing number of rows: 0, 2
How can I avoid the error? Here is the code I am using below.
# Merge the first 12 sheets of every matching workbook into one data frame.
# NOTE(review): getSheets/loadWorkbook/readWorksheet are not defined in this
# snippet; presumably they come from the XLConnect package -- confirm.
# list.files() treats `pattern` as a regular expression, not a shell glob.
file.names <- list.files(pattern='*.xls')
# Sheet names are taken from one reference workbook and reused for all files.
sheet.names <- getSheets(loadWorkbook('File.xls'))
sheet.names <-sheet.names[1:12]
# Output column names v1 ... v16.
e.names <- paste0(rep('v', 16), c(1:16))
# Seed the result with a single all-NA row; rows are appended below, so this
# NA row stays as the first row of data.1.
data.1 <- data.frame(matrix(rep(NA,length(e.names)),
ncol = length(e.names)))
names(data.1) <- e.names
for (i in 1:length(file.names)) {
wb <- loadWorkbook(file.names[i])
for (j in 1:length(sheet.names)) {
ss <- readWorksheet(wb, sheet.names[j], startCol = 2, header = TRUE)
# Identifier columns repeated once per data row of the sheet.
condition <- rep(sheet.names[j], nrow(ss))
sub.id <- rep(file.names[i], nrow(ss))
# NOTE(review): for a sheet with zero rows, 1:nrow(ss) is c(1, 0) and seq()
# of that vector has length 2, while the rep() results above have length 0.
# That mismatch is what data.frame() reports as "0, 2".
s.frame <- seq(1:nrow(ss))
df.1 <- data.frame(sub.id, condition, s.frame, ss)
# Assumes df.1 has exactly 16 columns (3 identifiers + 13 from the sheet).
names(df.1) <- e.names
data.1 <- rbind(data.1, df.1)
rm(ss, condition, s.frame, sub.id, df.1)
}
rm(wb)
}
I suppose this solution will work for you. It loads all .xlsx files in a specified folder into a list of lists. Sheet-names and -headers shouldn't be an issue.
library(openxlsx)
# Define folder where your files are
path_folder <- "C:/path_to_files/"

# File names of all .xlsx workbooks in the folder. The pattern is a regular
# expression, so the dot is escaped and the match is anchored to the end.
f <- list.files(path_folder, pattern = "\\.xlsx$")

# Load the data into a list (one element per file) of lists (one data frame
# per sheet). lapply() is used for the sheets rather than sapply(), which
# could collapse sheets of identical shape into a matrix, and the inner list
# is named after the sheets so they can be looked up by name.
data_list <- lapply(f, function(file_name) {
  workbook_path <- file.path(path_folder, file_name)
  sheets <- getSheetNames(workbook_path)
  sheet_data <- lapply(sheets, function(sheet) {
    read.xlsx(workbook_path, sheet = sheet)
  })
  names(sheet_data) <- sheets
  sheet_data
})

# Name the list elements after the files, without the extension.
names(data_list) <- sub("\\.xlsx$", "", f)
You end up with a list (containing each file) of lists (containing the sheets of each file).
From here you can remove empty sheets, merge and edit them as you like.
I added an if-statement that checks whether the sheet actually contains data rows and skips it otherwise; this resolved the error.
# Append every non-empty sheet of every workbook to data.1.
# Relies on file.names, sheet.names, e.names and data.1 as defined in the
# question's code.
for (i in seq_along(file.names)) {
  wb <- loadWorkbook(file.names[i])
  for (j in seq_along(sheet.names)) {
    ss <- readWorksheet(wb, sheet.names[j], startCol = 2, header = TRUE)
    n_rows <- nrow(ss)
    # Skip only sheets without any data rows. A sheet with exactly one row is
    # valid and must be kept, so the test is for zero rows, not "> 1".
    if (n_rows == 0) {
      rm(ss, n_rows)
      next
    }
    condition <- rep(sheet.names[j], n_rows)
    sub.id <- rep(file.names[i], n_rows)
    # seq_len() is safe for any row count, unlike seq(1:n).
    s.frame <- seq_len(n_rows)
    df.1 <- data.frame(sub.id, condition, s.frame, ss)
    names(df.1) <- e.names
    data.1 <- rbind(data.1, df.1)
    rm(ss, n_rows, condition, s.frame, sub.id, df.1)
  }
  rm(wb)
}
I am trying to write a function that combines several CSVs into one data frame based on the name of the CSV, and then names the data frame after the pattern that is passed into the function. Everything works except returning the data frame. I can't figure out how to do that using the function input as the name of the data frame.
I tried what is in this post, but I think the issue is that I only know the name of the dataframe based on the function input: Return a data frame from function
#### Create files in your current working directory ####
# Work inside a "temp" sub-folder of the current working directory and create
# one folder per run.
dir <- getwd()
subDir <- "temp"
dir.create(subDir)
setwd(file.path(dir, subDir))
invisible(lapply(c("Run1", "Run2"), dir.create))

# ---- Run 1: salary data and employee names ----
employeeID <- c("123", "456", "789")
salary <- c(21000, 23400, 26800)
startdate <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14"))
first <- c("John", "Jane", "Tom")
last <- c("Doe", "Smith", "Franks")

data <- data.frame(employeeID, salary, startdate)
name <- data.frame(employeeID, first, last)
write.csv(data, file = "Run1/data.csv", row.names = FALSE, na = "")
write.csv(name, file = "Run1/name.csv", row.names = FALSE, na = "")

# ---- Run 2: same layout, different employees ----
employeeID <- c("465", "798", "132")
salary <- c(100000, 500000, 300000)
startdate <- as.Date(c("2000-11-1", "2001-3-25", "2003-3-14"))
first <- c("Jay", "Susan", "Tina")
last <- c("Jones", "Smith", "Thompson")

data <- data.frame(employeeID, salary, startdate)
name <- data.frame(employeeID, first, last)
write.csv(data, file = "Run2/data.csv", row.names = FALSE, na = "")
write.csv(name, file = "Run2/name.csv", row.names = FALSE, na = "")
#### function ####
# Combine all CSV files whose name matches `pattern` (searched recursively
# below the working directory) into one data frame with a `run` id column.
# NOTE(review): str_sub, %>% and bind_rows are not defined in this snippet;
# presumably stringr and dplyr are expected to be loaded -- confirm.
files_to_df <- function(pattern){
# pattern <- "data"
filenames <- list.files(recursive = TRUE, pattern = pattern)
df_list <- lapply(filenames, read.csv, header = TRUE)
# Name each dataframe with the run and filename
# NOTE(review): `list` here is the base function, not the vector of file
# names; `filenames` looks like the intended argument.
names(df_list) <- str_sub(list, 1, 4)
# Create combined dataframe
df <- df_list %>%
bind_rows(.id = 'run')
# Assign dataframe to the name of the pattern
# NOTE(review): assign() with default arguments creates the variable inside
# this function's own environment, so it is not visible to the caller.
assign(pattern, df)
# Return the dataframe
# NOTE(review): this builds a data frame from the pattern string itself
# rather than returning the combined data in `df`.
return(data.frame(pattern))
#list2env(pattern,.GlobalEnv)
}
#### Run function ####
# The return value is not assigned to anything here.
files_to_df(c("data"))
I made two changes to your code:
1.) str_sub(list, 1, 4) -> str_sub(filenames, 1, 4)
list is a function and doesn't contain the file names.
2.) return(data.frame(pattern)) -> return(df)
This returns the data frame and not a string.
# Combine all CSV files whose name matches `pattern` into one data frame.
#
# pattern: regular expression passed to list.files(); files are searched
#          recursively below the current working directory.
#
# Returns the row-bound data of all matching files with an extra first column
# `run` holding the folder each file was found in (e.g. "Run1").
#
# A function cannot usefully create a variable in its caller by itself, so
# name the result where you call it:
#   data <- files_to_df("data")
# or, with a dynamic name:
#   assign(pattern, files_to_df(pattern))
files_to_df <- function(pattern) {
  filenames <- list.files(recursive = TRUE, pattern = pattern)
  df_list <- lapply(filenames, read.csv, header = TRUE)

  # Label each data frame with its run folder. dirname() also handles folder
  # names of other lengths (e.g. "Run10"), which a fixed 4-character
  # substring would truncate and merge.
  names(df_list) <- dirname(filenames)

  dplyr::bind_rows(df_list, .id = "run")
}
I have a number of csv files and my goal is to find the number of complete cases for a file or set of files given by id argument. My function should return a data frame with column id specifying the file and column obs giving the number of complete cases for this id. However, my function overwrites the previous value of nobs in each loop and the resulting data frame gives me only its last value. Do you have any idea how to get the value of nobs for each value of id?
# Count complete cases per id across all CSV files in the working directory.
# id: vector of ID values to report on (default 1:20).
myfunction<-function(id=1:20) {
# list.files() treats `pattern` as a regular expression, not a shell glob.
files<-list.files(pattern="*.csv")
# Read every file and stack them into one data frame.
myfiles = do.call(rbind, lapply(files, function(x) read.csv(x,stringsAsFactors = FALSE)))
for (i in id) {
# These two lines do not depend on i, so they are recomputed unchanged on
# every pass.
good<-complete.cases(myfiles)
newframe<-myfiles[good,]
cases<-newframe[newframe$ID %in% i,]
# NOTE(review): nobs is a single value that is overwritten on each pass, so
# only the count for the last id survives the loop.
nobs<-nrow(cases)
}
# The single remaining nobs is recycled across all ids.
clean<-data.frame(id,nobs)
clean
}
Thanks.
We can do all inside lapply(), something like below (not tested):
# Count the complete cases in each selected CSV file.
#
# id: positions of the files to process within the alphabetically sorted
#     list of CSV files in the working directory (default 1:20).
#
# Returns a data frame with one row per processed file: `ID` is the file
# name and `nobs` the number of rows without any missing value.
myfunction <- function(id = 1:20) {
  # `pattern` is a regular expression: escape the dot and anchor the end so
  # that only names ending in ".csv" match.
  files <- list.files(pattern = "\\.csv$")[id]
  # Positions beyond the number of files yield NA; drop them instead of
  # passing NA to read.csv().
  files <- files[!is.na(files)]

  counts <- lapply(files, function(x) {
    df <- read.csv(x, stringsAsFactors = FALSE)
    data.frame(ID = x, nobs = sum(complete.cases(df)))
  })

  # do.call(rbind, list()) would return NULL; return an empty table instead.
  if (length(counts) == 0) {
    return(data.frame(ID = character(0), nobs = integer(0)))
  }
  do.call(rbind, counts)
}
I am new to R and trying to do some correlation analysis on multiple sets of data. I am able to do the analysis, but I am trying to figure out how I can output the results of my data. I'd like to have output like the following:
NAME,COR1,COR2
....,....,....
....,....,....
If I could write such a file to output, then I can post process it as needed. My processing script looks like this:
# Run the correlation analysis for one log file.
# logfile: path of a comma-separated file with a header row.
# name:    label for this data set; not used in the body shown here.
run_analysis <- function(logfile, name)
{
preds <- read.table(logfile, header=T, sep=",")
# do something with the data: create some_col, another_col, etc.
# (some_col, another_col, ... are placeholders that are not defined in this
# snippet.)
result1 <- cor(some_col, another_col)
# NOTE(review): this assigns to result1 again, overwriting the first
# correlation; the comment below refers to result2, so this is presumably
# meant to be result2.
result1 <- cor(some_col2, another_col2)
# somehow output name,result1,result2 to a CSV file
}
# Driver: the first command-line argument is the date, the second the base
# path; the log files are looked up in <basepath>/<date>.
args <- commandArgs(trailingOnly = TRUE)
date <- args[1]
basepath <- args[2]
logbase <- paste(basepath, date, sep="/")
# Builds "*.<date>.csv".
# NOTE(review): list.files() interprets `pattern` as a regular expression,
# not as a shell glob, so this may match more than intended.
logfile_pattern <- paste( "*", date, "csv", sep=".")
logfiles <- list.files(path=logbase, pattern=logfile_pattern)
for (f in logfiles) {
# Data set name = the part of the file name before the first dot.
name = unlist(strsplit(f,"\\."))[1]
logfile = paste(logbase, f, sep="/")
# The value returned by run_analysis() is not stored.
run_analysis(logfile, name)
}
Is there an easy way to create a blank data frame and then add data to it, row by row?
Have you looked at the functions in R for writing data to files? For instance, write.csv. Perhaps something like this:
# Append one result row to a CSV file, writing the header only when the file
# is first created. write.csv() cannot be used for this: it ignores `append`
# (with a warning), so write.table() is called with CSV settings instead.
out_file <- "path/to/file"
rs <- data.frame(name = name, COR1 = result1, COR2 = result2)
is_new_file <- !file.exists(out_file)
write.table(rs, out_file, sep = ",", qmethod = "double",
            append = !is_new_file, col.names = is_new_file,
            row.names = FALSE)
I like using the foreach library for this sort of thing:
library(foreach)
# Run the correlation analysis for one log file.
#
# logfile: path of a comma-separated file with a header row.
# name:    label identifying this data set in the output.
#
# Returns a one-row data frame with the columns name, cor1 and cor2.
run_analysis <- function(logfile, name) {
  # TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
  preds <- read.table(logfile, header = TRUE, sep = ",")
  # do something with the data: create some_col, another_col, etc.
  result1 <- cor(some_col, another_col)
  result2 <- cor(some_col2, another_col2)
  # Return one row of results.
  data.frame(name = name, cor1 = result1, cor2 = result2)
}
# Driver: expects two command-line arguments, <date> and <basepath>, and
# analyses every "*.<date>.csv" file found in <basepath>/<date>.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 2) {
  stop("usage: Rscript <script> <date> <basepath>", call. = FALSE)
}
date <- args[1]
basepath <- args[2]
logbase <- file.path(basepath, date)
# list.files() expects a regular expression, so the shell-style wildcard is
# translated with glob2rx() instead of being passed through unchanged.
logfile_pattern <- glob2rx(paste("*", date, "csv", sep = "."))
logfiles <- list.files(path = logbase, pattern = logfile_pattern)
## Collect results from run_analysis into a table, by rows.
dat <- foreach(f = logfiles, .combine = "rbind") %do% {
  # Data set name = the part of the file name before the first dot.
  name <- unlist(strsplit(f, "\\."))[1]
  logfile <- file.path(logbase, f)
  run_analysis(logfile, name)
}
## Write output.
# row.names = FALSE keeps the file to the columns name, cor1 and cor2 without
# a leading column of row numbers.
write.csv(dat, "output.dat", quote = FALSE, row.names = FALSE)
What this does is to generate one row of output on each call to run_analysis, binding them into a single table called dat (the .combine="rbind" part of the call to foreach causes row binding). Then you can just use write.csv to get the output you want.