Error when merging Excel files in R with a blank sheet - r

I'm using the following code to merge several excel files with multiple sheets. I get an error when it runs across a sheet that has the same header as the other files but is not populated with data. This is the error:
Error in data.frame(sub.id, condition, s.frame, ss) :
arguments imply differing number of rows: 0, 2
How can I avoid the error? Here is the code I am using below.
# Collect the workbook files and the sheets to read (first 12 sheets of 'File.xls').
file.names <- list.files(pattern='*.xls')
sheet.names <- getSheets(loadWorkbook('File.xls'))
sheet.names <-sheet.names[1:12]
# Target column names v1 .. v16 for the merged data frame.
e.names <- paste0(rep('v', 16), c(1:16))
# Seed the result with a single all-NA row so that rbind() has matching columns.
data.1 <- data.frame(matrix(rep(NA,length(e.names)),
ncol = length(e.names)))
names(data.1) <- e.names
# For every file and every sheet: read the sheet, prepend id columns, append to data.1.
for (i in 1:length(file.names)) {
wb <- loadWorkbook(file.names[i])
for (j in 1:length(sheet.names)) {
ss <- readWorksheet(wb, sheet.names[j], startCol = 2, header = TRUE)
condition <- rep(sheet.names[j], nrow(ss))
sub.id <- rep(file.names[i], nrow(ss))
# NOTE(review): when ss has 0 rows, 1:nrow(ss) is c(1, 0) and seq() of that
# length-2 vector yields 1:2, while sub.id and condition have length 0. This is
# the "differing number of rows: 0, 2" error reported above.
s.frame <- seq(1:nrow(ss))
df.1 <- data.frame(sub.id, condition, s.frame, ss)
# Renaming requires df.1 to have exactly length(e.names) columns.
names(df.1) <- e.names
data.1 <- rbind(data.1, df.1)
rm(ss, condition, s.frame, sub.id, df.1)
}
rm(wb)
}

I suppose this solution will work for you. It loads all .xlsx files in a specified folder into a list of lists. Sheet-names and -headers shouldn't be an issue.
library(openxlsx)
# Define folder where your files are
path_folder <- "C:/path_to_files/"
# Collect only the .xlsx workbooks in that folder (anchored, escaped pattern)
f <- list.files(path_folder, pattern = "\\.xlsx$")
# Load data into a list (one element per file) of lists (one data frame per sheet).
# lapply() is used instead of sapply() on purpose: sapply() tries to simplify its
# result, and sheets with the same number of columns would be collapsed into a
# list-matrix instead of staying a list of data frames.
data_list <- lapply(f, function(file_name) {
  file_path <- paste0(path_folder, file_name)
  sheet_names <- getSheetNames(file_path)
  sheets <- lapply(sheet_names, function(sheet_name) {
    read.xlsx(file_path, sheet = sheet_name)
  })
  # name the inner list elements after their sheets
  names(sheets) <- sheet_names
  sheets
})
# name the outer list elements after the files, without the extension;
# the escaped, anchored pattern only strips a trailing ".xlsx"
names(data_list) <- sub("\\.xlsx$", "", f)
You end up with a list (containing each file) of lists (containing the sheets of each file).
From here you can remove empty sheets, merge and edit them as you like.

I added an if-statement that checks the number of rows after reading each sheet and skips the sheet when it has no data to merge; this resolved the error.
# Merge every sheet of every workbook into data.1, skipping sheets without data.
# Relies on file.names, sheet.names, e.names and data.1 defined earlier.
for (i in seq_along(file.names)) {
  wb <- loadWorkbook(file.names[i])
  for (j in seq_along(sheet.names)) {
    ss <- readWorksheet(wb, sheet.names[j], startCol = 2, header = TRUE)
    # Only a sheet with zero data rows triggers the data.frame() error, so test
    # for > 0; testing for > 1 would silently drop valid one-row sheets.
    if (nrow(ss) > 0) {
      condition <- rep(sheet.names[j], nrow(ss))
      sub.id <- rep(file.names[i], nrow(ss))
      # seq_len() is safe for any row count, unlike seq(1:nrow(ss))
      s.frame <- seq_len(nrow(ss))
      df.1 <- data.frame(sub.id, condition, s.frame, ss)
      names(df.1) <- e.names
      data.1 <- rbind(data.1, df.1)
      rm(ss, condition, s.frame, sub.id, df.1)
    }
  }
  rm(wb)
}

Related

List elements getting overwritten in for loop R?

I have a bunch of csv files that I'm trying to read into R all at once, with each data frame from a csv becoming an element of a list. The loops largely work, but they keep overriding the list elements. So, for example, if I loop over the first 2 files, both data frames in list[[1]] and list[[2]] will contain the data frame for the second file.
# Read the "coreco.core" CSV for one year and organisation type.
#   year:    folder name and year part of the file name
#   orgtype: suffix of the file name (the caller passes "pc", "pf" or "co")
# Column names are lower-cased; rows are kept when ntee1 is "C" or "D" and
# nteecc is not one of the excluded D-codes. Returns the filtered data frame.
open_csv_core <- function(year, orgtype) {
  csv_path <- paste0(year, "/coreco.core", year, orgtype, ".csv")
  records <- read.csv(csv_path)
  names(records) <- tolower(names(records))
  excluded_codes <- c("D20", "D40", "D50", "D60", "D61")
  wanted <- records$ntee1 %in% c("C", "D") &
    !(records$nteecc %in% excluded_codes)
  records[wanted, ]
}
# Read the "nccs.core" CSV for one year and organisation type.
#   year:    folder name and year part of the file name
#   orgtype: suffix of the file name (the caller passes "pc", "pf" or "co")
# Lower-cases the column names, then keeps rows whose ntee1 is "C" or "D" and
# whose nteecc is not D20/D40/D50/D60/D61. Returns the filtered data frame.
open_csv_nccs <- function(year, orgtype) {
  source_file <- paste0(year, "/nccs.core", year, orgtype, ".csv")
  tbl <- read.csv(source_file)
  names(tbl) <- tolower(names(tbl))
  in_scope <- tbl$ntee1 %in% c("C", "D")
  is_excluded <- tbl$nteecc %in% c("D20", "D40", "D50", "D60", "D61")
  tbl[in_scope & !is_excluded, ]
}
#############################################################################
# Result lists, one per organisation type; elements are named "<year><type>".
yrpc<- list()
yrpf<- list()
yrco<- list()
fname<- vector()
# Folder names: one folder per year, 1989 to 2019.
file_yrs<- as.character(c(1989:2019))
# Outer loop: i indexes the year folders.
for(i in 1:length(file_yrs)){
fname<- list.files(path = file_yrs[i], pattern = NULL)
#accessing files in a folder and assigning to the proper function to open them based on how the file is named
# Inner loop: j indexes the files found in the folder for year i.
# NOTE(review): the readers below are called with file_yrs[j], i.e. the year is
# picked with the FILE index j, while the result is stored under file_yrs[i].
# This looks like it should be file_yrs[i]; as written, every year folder
# re-reads the files of the first few years. Confirm against the intended behavior.
for(j in 1:length(fname)){
if(grepl("pc.csv", fname[j])==T) {
if(grepl("nccs", fname[j])==T){
a <- open_csv_nccs(file_yrs[j], "pc")
yrpc[[paste0(file_yrs[i], "pc")]] <- a
} else {
b<- open_csv_core(file_yrs[j], "pc")
yrpc[[paste0(file_yrs[i], "pc")]] <- b
}
} else if (grepl("pf.csv", fname[j])==T){
if(grepl("nccs", fname[j])==T){
c <- open_csv_nccs(file_yrs[j], "pf")
yrpf[[paste0(file_yrs[i], "pf")]] <- c
} else {
d<- open_csv_core(file_yrs[j], "pf")
yrpf[[paste0(file_yrs[i], "pf")]] <- d
}
} else {
# Any file that is neither "pc.csv" nor "pf.csv" is treated as type "co".
if(grepl("nccs", fname[j])==T){
e<- open_csv_nccs(file_yrs[j], "co")
yrco[[paste0(file_yrs[i], "co")]] <- e
} else {
f<- open_csv_core(file_yrs[j], "co")
yrco[[paste0(file_yrs[i], "co")]] <- f
}
}
}
}
Actually, both of your csv reading functions do exactly the same,
except that the paths are different.
If you list your files with their full paths instead of bare file names,
you don't need to reconstruct the paths the way you do now.
This is possible with full.names = TRUE in list.files().
The second point is that there never seems to be both a "nccs.core" file
and a "coreco.core" file for the same year and the same type, so they are mutually
exclusive. Hence no logic is needed to distinguish those cases, which simplifies the code.
The third point is, you just want to separate the data frames by filetype ("pc", "pf", "co") and years.
Instead of creating three separate lists (one per type), I would create a single results list that contains an inner list for each type.
I would solve this like this:
years <- 1989:2019
# Extract the file type ("pc", "pf" or "co") from a path ending in "<type>.csv".
path_to_type <- function(path) sub(".*(pc|pf|co)\\.csv$", "\\1", path)
# One results list holding an inner list per file type.
res <- list("pc" = list(),
            "pf" = list(),
            "co" = list())
# Plain for loops are used on purpose: an assignment to `res` inside an
# lapply() closure would only modify a local copy, leaving `res` empty.
for (year in years) {
  # list.files() needs a character path, so convert the numeric year.
  files <- list.files(path = as.character(year), pattern = "\\.csv$",
                      full.names = TRUE)
  for (path in files) {
    print(path) # just to signal that the path is getting processed
    file_type <- path_to_type(path)
    # Skip CSV files whose name does not end in one of the known types.
    if (!file_type %in% names(res)) {
      next
    }
    df <- read.csv(path)
    names(df) <- tolower(names(df))
    df <- df[df$ntee1 %in% c("C", "D"), ]
    df <- df[!(df$nteecc %in% c("D20", "D40", "D50", "D60", "D61")), ]
    # Indexed by the numeric year, so res[["co"]][[1995]] retrieves year 1995;
    # positions that do not correspond to a loaded year stay NULL.
    res[[file_type]][[year]] <- df
  }
}
Now you can call from result's list by file_type and year
e.g.:
res[["co"]][[1995]]
res[["pf"]][[2018]]
And so on.
Actually, the results of the lapply() calls in this case are
not interesting. Just the content of res ... (result list).
It seems that in your for(j in 1:length(fname)){... loop you are creating one of six variables (a, b, c, d, e or f) on each pass. You're reusing these variable names, so they are getting overwritten.
The "correct" way to do this is to use lapply in place of the for loop. Pass the list of files, and the required function (i.e. open_csv_core, etc) to lapply, and the return value that you get back is a list of the results.

how to loop through multiple dataframes and write into multiple sheets of excel in R?

i want to write into multiple sheets of excel using loop. code is mentioned below.
# Example data mirroring the two sheets of df.xlsx.
first_column <- c("value_1", "value_2")
second_column <- c("ve_1", "ve_2")
fir_column <- c("1", "2")
se_column <- c("a1", "va2")
df <- data.frame(first_column, second_column) #sheet A of df.xlsx
df1 <- data.frame(fir_column, se_column) #sheet B of df.xlsx
# Read each sheet of the existing workbook and write it to a new workbook.
sheets<-openxlsx::getSheetNames('./Out/df.xlsx')
for(i in 1:length(sheets)){
# df is overwritten on every pass with the sheet that was just read.
df<-read_excel('./Out/df.xlsx',sheet = sheets[i])
# NOTE(review): write.xlsx is called without a package prefix, so which
# implementation runs depends on the packages attached; confirm it is the one
# that supports append. sheetName is passed the numeric index i, not a string.
write.xlsx(df, './Out/df1.xlsx', sheetName = i, append=TRUE)
}
Only last sheet is getting generated.
Using your code (note pathnames are edited slightly):
library(xlsx)
library(readxl)
sheets <- openxlsx::getSheetNames("./df.xlsx")
# seq_along() avoids the c(1, 0) iteration that 1:length() gives for empty input.
for (i in seq_along(sheets)) {
  # read_excel() returns a tibble; convert it to a plain data.frame before
  # handing it to xlsx::write.xlsx.
  df <- as.data.frame(readxl::read_excel("./df.xlsx", sheet = sheets[i]))
  # The first sheet creates the file, every later sheet is appended to it.
  appendSheet <- i > 1
  xlsx::write.xlsx(df, "./df1.xlsx", sheetName = as.character(i),
                   append = appendSheet)
}
Or with openxlsx:
library(readxl)
library(openxlsx)
sheets <- openxlsx::getSheetNames("./df.xlsx")
# Build the whole workbook in memory and save it once at the end.
wb <- createWorkbook()
for (i in seq_along(sheets)) {
  df <- readxl::read_excel("./df.xlsx", sheet = sheets[i])
  # Sheets are named "1", "2", ...; the name is passed as a string explicitly.
  addWorksheet(wb, as.character(i))
  # A numeric sheet argument addresses the sheet by position.
  writeData(wb, i, df)
}
# TRUE instead of T: T is an ordinary variable and can be reassigned.
saveWorkbook(wb, "df1.xlsx", overwrite = TRUE)
Alternatively, you can work with the sheets as list elements, allowing you to easily work over each element with the apply family of functions:
library(rio)
library(writexl)
# Read the workbook into a list (presumably one data frame per sheet; see the
# rio documentation for import_list).
list_of_dfs <- import_list("df.xlsx") # from rio
# Write the list back out to a new workbook (presumably one sheet per list
# element; see the writexl documentation).
write_xlsx(list_of_dfs, "df1.xlsx") # from writexl

comparing excel sheets in R

I have two excel files with multiple sheets. The sheet names and their corresponding column names are same of both the files. Only the values in the sheets will differ. I want to compare using R which values are different and want to mark those cells
For the fun of it, here's a quick&dirty example on which you can build up for your specific needs:
# Create a temporary .xlsx workbook with one sheet ("Sheet1") holding a 5 x 5
# grid of cells filled row by row from v.
#   v: vector whose first 25 elements become the cell values
# Returns the path of the temporary file that was written.
wbsCreate <- function(v) {
  wb <- createWorkbook()
  sheet <- createSheet(wb, "Sheet1")
  grid <- 1:5
  cells <- createCell(createRow(sheet, rowIndex = grid), colIndex = grid)
  # Walk the 25 positions in row-major order: k = (row - 1) * 5 + col.
  for (k in 1:25) {
    row_idx <- (k - 1) %/% 5 + 1
    col_idx <- (k - 1) %% 5 + 1
    setCellValue(cells[[row_idx, col_idx]], value = v[k])
  }
  out_path <- tempfile(fileext = ".xlsx")
  saveWorkbook(wb, out_path)
  out_path
}
# Highlight, in both workbooks, every cell whose value differs between them.
#   fn1, fn2: paths of the two workbooks to compare; both files are overwritten
#             with the marked versions.
# Only sheets present in both workbooks are compared, and within a sheet only
# cells that exist in both files; differing cells get a solid red fill.
wbsMarkDiff <- function(fn1, fn2) {
  fns <- c(fn1, fn2)
  wb <- lapply(fns, loadWorkbook)
  # A cell style belongs to a workbook, so build one red style per workbook.
  cs <- lapply(wb, function(x) CellStyle(x) +
                 Fill(backgroundColor="red",
                      foregroundColor="red",
                      pattern="SOLID_FOREGROUND"))
  sheets <- lapply(wb, getSheets)
  sheetnames <- do.call(intersect, lapply(sheets, names))
  for (sheetname in sheetnames) {
    sheet <- lapply(sheets, "[[", sheetname)
    rows <- lapply(sheet, getRows)
    cells <- lapply(rows, getCells)
    values <- lapply(cells, function(cell) lapply(cell, getCellValue))
    # Compare cells by name rather than by position, so that a cell present in
    # only one file cannot shift the pairing or cause a lookup of a missing cell.
    common <- intersect(names(values[[1]]), names(values[[2]]))
    differs <- vapply(common, function(key) {
      !identical(values[[1]][[key]], values[[2]][[key]])
    }, logical(1))
    for (s in seq_along(wb)) {
      for (key in common[differs]) {
        setCellStyle(cells[[s]][[key]], cs[[s]])
      }
    }
  }
  # Save each workbook once, after all sheets are marked, instead of once per sheet.
  for (s in seq_along(wb)) {
    saveWorkbook(wb[[s]], fns[s])
  }
}
library(xlsx)
# create two excel workbooks (same dimensions per sheet)
v <- LETTERS[1:25]
tf1 <- wbsCreate(v)
# change three of the 25 values to lower case so the second workbook differs
v[c(3,6,9)] <- letters[c(3,6,9)]
tf2 <- wbsCreate(v)
# mark differences
wbsMarkDiff(tf1, tf2)
shell.exec(tf1) # open file1 on windows
shell.exec(tf2) # open file2 on windows
You get help on each command by using ?, for example ?createWorkbook gives you the help files on that function.

Loop through an array and download web data from a link?

I'm trying to download data from Table2 from this link.
http://finviz.com/quote.ashx?t=aapl
I'd like to write all the elements in an array into a CSV file but I can't even get the loop working. The paste function seems unnecessary, but that's all I could find after several Google searches.
library('rvest')
stocks <- c("AXP","BA","CAT","CSCO")
# NOTE(review): the loop index i is never used in the body; every pass works on
# the whole `stocks` vector instead of stocks[i].
for (i in 1 : length(stocks))
{
# paste() uses sep = " " by default, so a space is inserted before each ticker,
# and because `stocks` has four elements this yields four URLs at once.
url <- sprintf(paste("http://finviz.com/quote.ashx?t=", stocks));
x<-read_html(paste("http://finviz.com/quote.ashx?t=", stocks))
# The extracted table is not assigned to anything, so it is discarded.
html_table( x %>% html_nodes("table.snapshot-table2"))
}
Thanks for the help with this.
Unfamiliar with rvest package, you could however handle your html table extract with R's XML package, specifically its readHTMLTable function.
library(XML)
stocks <- c("AXP", "BA", "CAT", "CSCO")
# Column headers data1 .. data12 for the extracted table.
table_header <- paste0("data", 1:12)
for (s in stocks) {
  url <- paste0("http://finviz.com/quote.ashx?t=", s)
  webpage <- readLines(url)
  html <- htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
  tableNodes <- getNodeSet(html, "//table")
  # The 9th <table> on the page is read; this position is hard-coded.
  df <- readHTMLTable(tableNodes[[9]], header = table_header)
  # ADD COLUMN TO IDENTIFY STOCK
  df["stock"] <- s
  # ASSIGN TO STOCK NAMED DFS
  assign(s, df)
}
# COMBINE ALL STOCK DATA
stockdatalist <- unname(mget(stocks))
stockdata <- do.call(rbind, stockdatalist)
# MOVE STOCK ID TO FIRST COLUMN
# seq_len(n - 1) is used because 1:n-1 parses as (1:n) - 1, i.e. 0:(n - 1),
# which only works by accident since index 0 is dropped.
stockdata <- stockdata[, c(ncol(stockdata), seq_len(ncol(stockdata) - 1))]
# SAVE TO CSV
write.table(stockdata, "C:\\Path\\To\\StockData.csv", sep = ",",
            row.names = FALSE, col.names = FALSE)
# REMOVE TEMP OBJECTS
rm(df, stockdatalist)
To extract every stock indicator with its corresponding value as shown on website and assign to separate lists, consider using XML's xpathSApply() to XPath the needed table cell and row. Then append each returned xml value into list and combine lists to final dataframe:
# For every stock, walk the cells of the 9th table: odd columns hold an
# indicator name, the following even column holds its value. Each indicator
# becomes a global variable named after the indicator, holding one value per stock.
for (s in stocks) {
url <- paste0("http://finviz.com/quote.ashx?t=", s)
webpage <- readLines(url)
html <- htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
for (row in 1:12) {
for (col in seq(1, 12, 2)) {
# The first stock ("AXP") creates the variable; later stocks append to it.
if (s == "AXP") {
assign(xpathSApply(html, sprintf("//table[9]/tr[%d]/td[%d]", row, col), xmlValue)[1],
xpathSApply(html, sprintf("//table[9]/tr[%d]/td[%d]", row, col+1), xmlValue)[1])
} else {
stocklist <- get(xpathSApply(html, sprintf("//table[9]/tr[%d]/td[%d]", row, col), xmlValue)[1])
stocklist <- c(stocklist,
xpathSApply(html, sprintf("//table[9]/tr[%d]/td[%d]", row, col+1), xmlValue)[1])
assign(xpathSApply(html, sprintf("//table[9]/tr[%d]/td[%d]", row, col), xmlValue)[1],
stocklist)
}
}
}
}
# NOTE(review): `stock` is not assigned anywhere in this snippet, so rm() will
# warn about it unless it exists from earlier code.
rm(stocklist, stock, url, html, webpage, row, col, s) # REMOVE TEMP OBJECTS
# Hard-coded replacement of one indicator's values (one value per stock).
`EPS next Y` <- c("2.90%","14.21%","-21.84%","5.63%") # CLEAN UP ONE LIST
# COMBINE LISTS INTO DATA FRAME
# NOTE(review): ls() returns every object in the current environment, not only
# the indicator variables created above; `stocks` itself (and anything else
# left in the workspace) is therefore included. Confirm the workspace is clean.
lists <- as.list(ls()) # LIST OF STOCK INDICATORS
df <- data.frame(stocks=unlist(stocks)) # CREATE DF
for (i in lists){
slist <- get(i)
df <- cbind(df, slist) # ADD DF COLS
}
names(df) <- c("Stock", lists) # RENAME COLUMNS

reading excel files into a single dataframe with readxl R

I have a bunch of excel files and I want to read them and merge them into a single data frame.
I have the following code:
library(readxl)
# All files in the working directory (no pattern filter is applied).
files <- list.files()
# NOTE(review): f is not used in the snippets shown here.
f <- list()
# Variable names derived from the file names by removing ".xls".
data_names <- gsub("[.]xls", "", files)
to read each excel file into data frames
# Read sheet 1 of each file (skipping the first 6 rows) into a separate
# variable named after the file.
for (i in 1:length(files)){
assign(data_names[i], read_excel(files[i], sheet = 1, skip = 6))
}
But if I try to save the result in a single variable, only the last file is kept:
# temp is reassigned on every pass, so after the loop it holds only the data
# read from the last file.
for (i in 1:length(files)){
temp <- read_excel(files[i], sheet = 1, skip = 6)
}
I would do this using plyr:
library(readxl)
library(plyr)
# Anchor the pattern so only names ending in .xls or .xlsx are picked up;
# "\\.xls" alone also matches names that merely contain ".xls".
files <- list.files(".", "\\.xlsx?$")
# Read sheet 1 of every file (skipping 6 rows) and row-bind the results.
data <- ldply(files, read_excel, sheet = 1, skip = 6)
If you wanted to add a column with the file name, you could instead do:
# Same as above, but with a leading File column holding the source file name.
data <- ldply(files, function(fil) {
  data.frame(File = fil, read_excel(fil, sheet = 1, skip = 6))
})
I would recommend using a list in R; assign() can be quite confusing, and it is hard to retrieve the values afterwards with get().
Should look like this:
# Read sheet 1 of every file (skipping 6 rows) into a list, one element per file.
# lapply() builds the list in one step instead of growing it inside a loop.
l <- lapply(files, read_excel, sheet = 1, skip = 6)
# Stack all data frames into one.
ltogether <- do.call("rbind", l)

Resources