XLConnect sheet names too long - r

The code below asks the user for a path to .csv files, makes a list of the .csv filenames, then writes the contents of each .csv file to a sheet, in one .xlsx file. Each sheet is named after the original name of the .csv file.
My problem is that some of my .csv filenames are over 31 characters long, which is the limit for sheet names in Excel.
I want to put the sheet name in cell A1, write the contents of the file below that and give the sheets a name (1, 2, 3..., for instance), but I can't wrap my head around how I would accomplish this. Any suggestions would be greatly appreciated.
library(data.table) ## for fast fread() function
library(XLConnect)
library(svDialogs)
# Prompt for the directory that holds the csv files and work from there
folder <- dlgInput(
  message = "Enter path to csv files (use '/' instead of '\\': ",
  default = Sys.info()["user"],
  title = "Merge csv"
)$res
setwd(folder)

# Open the output workbook, creating it when it does not exist yet
wb <- loadWorkbook("Output.xlsx", create = TRUE)

# Collect the csv paths and key them by file name without the extension
pattern.ext <- "\\.csv$"
files <- dir(folder, full.names = TRUE, pattern = pattern.ext)
files.nms <- gsub(pattern.ext, "", basename(files))
names(files) <- files.nms

# One sheet per csv file, named after that file
for (sheet_name in files.nms) {
  csv_data <- fread(files[[sheet_name]])
  createSheet(wb, name = sheet_name)
  writeWorksheet(
    object = wb,
    data = csv_data,
    sheet = sheet_name,
    header = TRUE,
    rownames = NULL
  )
}

# Remove default sheets
for (default_sheet in c("Sheet1", "Sheet2", "Sheet3")) {
  removeSheet(wb, sheet = default_sheet)
}
saveWorkbook(wb)

# Tell the user whether the output file is on disk
if (!file.exists("Output.xlsx")) {
  dlg_message("Error: Your file was not created. Please try again.")$res
} else {
  dlg_message("Your Excel file has been created.")$res
}
EDIT - ALTERNATE SOLUTION:
Modified code to initially take substring of the file names...
# Use the file names, minus extension, as sheet names. The extension is
# stripped BEFORE truncating: cutting first can slice through ".csv" (leaving
# a fragment such as ".c"), after which the pattern no longer matches.
files.nms <- gsub(pattern.ext, "", basename(files))
# Excel sheet name limit is 31 characters
files.nms <- substr(files.nms, 1, 31)
FINAL EDIT: Added a welcome message and a function that asks the user for the path and checks whether it exists. If it does not, the function keeps asking until the user cancels or enters an existing directory.
library(data.table) # fread() function
library(XLConnect) # Excel and csv files
library(svDialogs) # Dialog boxes
# Greet the user and explain what the script is about to do
# (the text contains a literal line break, which is shown in the dialog)
welcome_text <- "This program will merge one or more .csv files into one Excel file. When prompted, enter the path
where the .csv files are located. Sheet names in the Excel file will consist of a substring of the original filename."
dlg_message(welcome_text)$res
# Function to get user path
#
# Keeps prompting until the user enters an existing directory, makes that
# directory the working directory and returns it.
#
# Returns: the chosen directory (character scalar).
# Errors:  stops when the dialog is cancelled.
getPath <- function() {
  repeat {
    # Ask for path
    path <- dlgInput("Enter path to .csv files: ", Sys.info()["user"])$res
    # A cancelled dialog comes back as a zero-length result; handing that to
    # `if` would fail with "argument is of length zero"
    if (length(path) == 0) {
      stop("No directory selected.", call. = FALSE)
    }
    if (dir.exists(path)) {
      # setwd() returns the PREVIOUS working directory, so the chosen path is
      # returned explicitly rather than letting setwd()'s value fall through
      setwd(path)
      return(path)
    }
    # Not an existing directory: report it and ask again
    dlg_message("Error: The path you entered is not a valid directory. Please try again.")$res
  }
}
# Call getPath function. getPath() switches the working directory to the
# folder the user chose, so the folder is read back with getwd() instead of
# depending on what getPath() happens to return.
invisible(getPath())
folder <- getwd()

# Create and load Excel file
wb <- loadWorkbook("Combined.xlsx", create = TRUE)

# Get list of csv files in directory
pattern.ext <- "\\.csv$"
files <- dir(folder, full.names = TRUE, pattern = pattern.ext)

# Sheet names: drop the extension first, then keep characters 9-39 of what is
# left (31 characters at most, the Excel sheet-name limit). Truncating before
# removing the extension could cut through ".csv" and leave a fragment behind.
files.nms <- gsub(pattern.ext, "", basename(files))
files.nms <- substr(files.nms, 9, 39)

# Set the names
names(files) <- files.nms

# Iterate over each .csv and output to Excel sheet
for (nm in files.nms) {
  # Read in .csv files
  df <- fread(files[[nm]])
  # Create the sheet and name as substr of file name
  createSheet(object = wb, name = nm)
  # Writes contents of the .csv to Excel
  writeWorksheet(object = wb, data = df, sheet = nm, header = TRUE, rownames = NULL)
  # Create a custom anonymous cell style with wrapped text
  # NOTE(review): this style is never attached to any cell in this script, so
  # it has no visible effect -- confirm whether it is still wanted
  cs <- createCellStyle(wb)
  setWrapText(object = cs, wrap = TRUE)
  # Set column width for the first 50 columns
  setColumnWidth(object = wb, sheet = nm, column = 1:50, width = -1)
}
saveWorkbook(wb)

# Check to see if Excel file exists and is greater than default file size.
# `&&` is the scalar operator meant for `if` and short-circuits, so
# file.size() is only evaluated when the file exists.
if (file.exists("Combined.xlsx") && file.size("Combined.xlsx") > 8731) {
  dlg_message("Your Excel file has been created.")$res
} else {
  dlg_message("Error: Your file may not have been created or completed properly. Please verify and try again if necessary.")$res
}

Have you considered using openxlsx?
It's not Java-dependent and it's pretty feature-rich.
# Load package
library(openxlsx)
# Create workbook
wb <- createWorkbook()
# Names of the data.frames you want to write (adapt to your scenario)
df <- c("mtcars", "iris")
# Loop over the names; seq_along() gives no iterations for an empty vector,
# whereas 1:length(df) would count 1, 0
for (i in seq_along(df)) {
  # Create a worksheet named after the index ("1", "2", ...)
  addWorksheet(wb, as.character(i))
  # In the first row, first column, write the data.frame's name
  writeData(wb, i, df[i], startRow = 1, startCol = 1)
  # Write the data.frame itself from the second row, first column.
  # get() looks the object up by name; unlike eval(parse(text = ...)) it
  # cannot execute arbitrary code contained in the string
  writeData(wb, i, get(df[i]), startRow = 2, startCol = 1)
}
# Save workbook
saveWorkbook(wb, "eg.xlsx", overwrite = TRUE)

Related

Corrupted/overwritten excel file with write_xlsx

I am using this code in R to change the name of the fourth column of an Excel file (thanks to this: Change column name with file name of corresponding excel file). However, the problem is that at the end the file is overwritten and a corrupted Excel file is generated (Excel file format and extension don’t match). How is it possible to create a new Excel file at the end (instead of overwriting the original), or do you have any other solution that avoids creating a corrupted file?
# Every file in the working directory whose name matches the pattern
filenames <- list.files(full.names = TRUE, pattern = '\\.xlsx')
# For each workbook: name column 4 after the file, then overwrite the file
lapply(filenames, function(path) {
  sheet <- readxl::read_excel(path)
  new_name <- tools::file_path_sans_ext(basename(path))
  names(sheet)[4] <- new_name
  writexl::write_xlsx(sheet, path)
})
You can use tools::file_path_sans_ext and tools::file_ext to put together a new output filename. In the code below, I append the suffix _new to the file name before the file extension.
filenames <- list.files(full.names = TRUE, pattern = '\\.xlsx')
lapply(filenames, function(x) {
  # File name without directory and extension; doubles as the new column name
  stem <- tools::file_path_sans_ext(basename(x))
  data <- readxl::read_excel(x)
  names(data)[4] <- stem
  # Write to "<name>_new.<ext>" so the input file itself is left untouched
  out_file <- paste0(stem, "_new.", tools::file_ext(x))
  writexl::write_xlsx(data, out_file)
})

Read second sheet of xlsx file from various subdirectories of a main directory R

I want to read the sheet whose name contains the word "All" or "all" from an Excel workbook in every subdirectory, selecting the workbooks by a specific pattern.
I have tried list.files() but it does not work properly.
# Collect every file below common_path whose name matches the pattern
files_to_read <- list.files(
  common_path,        # directory to search within
  "X - GEN",          # regex pattern, some explanation below
  full.names = TRUE,  # return the full path
  recursive = TRUE    # search subdirectories
)
# Read each of those files with read.xlsx
data_lst <- lapply(files_to_read, read.xlsx)
I am assuming your sub-directories have a similar name that can be identifiable?
Assumption, let's say:
your sub-directory starts with 'this' and
the files that are saved in sub-directory starts with the file name 'my_file'
the tab that you are trying to read in contains the word 'all'.
If the tab that you are reading in is located in same position (e.g. 2nd tab of every file) then it is easier as you can specify the sheet within read.xlsx as sheet = 2 but if this is not the case then one way you could do is by creating your own function that allows for this.
Then
library(openxlsx)
# getting the name of subdirectories starting with the word 'this'
my_dir <- list.files(pattern = "^this", full.names = TRUE)
# getting the name of the files starting with 'my_file', e.g. my_file.xlsx, my_file2.xlsx
my_files <- list.files(my_dir, pattern = "^my_file", full.names = TRUE)

# Read the worksheet whose name matches a pattern from one workbook.
#
# files_to_read:  path to a single .xlsx file
# sheets_to_read: regular expression matched against the sheet names,
#                 ignoring case
# Returns: the data.frame read.xlsx() produces for the matching sheet.
# Errors:  stops when no sheet name matches.
# Warns:   when several sheet names match; the first one is read.
my_read_xlsx <- function(files_to_read, sheets_to_read) {
  # file to import
  wb <- loadWorkbook(files_to_read)
  # sheet names that contain 'all' or any other string that you specify;
  # ignore.case makes the match case-insensitive
  sheet_names <- names(wb)
  ws <- sheet_names[grepl(sheets_to_read, sheet_names, ignore.case = TRUE)]
  if (length(ws) == 0) {
    stop("No sheet matching '", sheets_to_read, "' in ", files_to_read,
         call. = FALSE)
  }
  if (length(ws) > 1) {
    warning("Several sheets match '", sheets_to_read, "' in ", files_to_read,
            "; reading '", ws[1], "'", call. = FALSE)
    ws <- ws[1]
  }
  # reading in the excel tab selected above
  read.xlsx(wb, ws)
}

# Using the function created above and import tabs containing 'all'.
# The argument is spelled out in full instead of relying on partial matching.
my_list <- lapply(my_files, function(x) my_read_xlsx(x, sheets_to_read = "ALL"))
# Converting the list into a data.frame
my_data <- do.call("rbind", my_list)

Writing a new column in all excel files (and all sheets in an excel file) in a folder using R

So, I have many excel files in a folder, and each file has multiple sheets. If the name of the excel file is 'xyz', I want each sheet of each excel file to contain a 'new_column' such that each row of the new column will contain the excel file name (in this example, 'xyz').
Is there any direct way to do that? I would prefer to directly alter the files in the folder without creating new dataframes within rstudio.
Thanks.
You can use double lapply -
library(readxl)
library(writexl)
# Get a vector of xlsx filenames. The dot is escaped and the pattern anchored
# so that only names ending in ".xlsx" match; an unescaped '.xlsx' matches any
# character followed by "xlsx" anywhere in the name.
filenames <- list.files(pattern = "\\.xlsx$", full.names = TRUE)
lapply(filenames, function(x) {
  # File name without directory and extension, e.g. "./xyz.xlsx" -> "xyz"
  file_label <- tools::file_path_sans_ext(basename(x))
  # Read the sheet names
  sheetname <- excel_sheets(x)
  # For each sheet read the data and add the file name as a column
  res <- lapply(sheetname, function(y) {
    sheet_data <- read_xlsx(x, y)
    # rep() sizes the column to the sheet, so a sheet without data rows gets
    # an empty column instead of a row-count mismatch
    sheet_data$filename <- rep(file_label, nrow(sheet_data))
    sheet_data
  })
  # Assign names to the list so every sheet keeps its original name
  names(res) <- sheetname
  # Write the data back over the original file
  write_xlsx(res, x)
})

Merge multiple Excel files starting at row in R

I have multiple Excel files that I need to merge into one, but only certain rows. The Excel files look like this...
The column headers are identical for all files. I also need to add a new column A to the newly generated file, so I created a separate Excel file with just the headers and the new column A. My script first reads in this file (below) and writes it to the workbook...
Next, I need to read each file, starting at row 9 and merge all the data, one after another. So the final result should look like this (minus the Member site column, I haven't attempted the logic for that yet, but thinking it will be a substring of the Specimen ID value)...
However, my current result is...
I am currently only using 3 files, each with a few dozen rows, to start, but the end goal is to merge about 15-30 files, each with 25 to 200 rows, give or take. So...
1) I know my code is incorrect, but not sure how to get the intended results. For one, my loop is overwriting data because it's constantly starting at row/column 2 when it writes. However, I can't think how to rewrite this.
2) The dates are returning in General format ("43008" instead of "9/30/2017")
3) Certain columns data is being placed under different columns (like Nucleic Acid Concentration has the values from the Date of Tissue Content).
Any advice or help would be greatly appreciated!
My code...
library(openxlsx) # Excel and csv files
library(svDialogs) # Dialog boxes
setwd("C:/Users/Work/Combined Manifest")
# Start an empty workbook that holds a single "Template" sheet
wb <- createWorkbook()
addWorksheet(wb, sheetName = "Template")
# Read the header file and write it, column names included, to that sheet
df.headers <- read.xlsx("headers.xlsx", sheet = "Template")
writeData(wb, sheet = "Template", x = df.headers, colNames = TRUE)
# Function to get user path
#
# Keeps prompting until the user enters an existing directory and returns
# that path. The working directory is not changed here.
#
# Returns: the chosen directory (character scalar).
# Errors:  stops when the dialog is cancelled.
getPath <- function() {
  repeat {
    # Ask for path
    path <- dlgInput("Enter path to files: ", Sys.info()["user"])$res
    # A cancelled dialog comes back as a zero-length result; handing that to
    # `if` would fail with "argument is of length zero"
    if (length(path) == 0) {
      stop("No directory selected.", call. = FALSE)
    }
    if (dir.exists(path)) {
      return(path)
    }
    # Not an existing directory: report it and ask again
    dlg_message("Error: The path you entered is not a valid directory. Please try again.")$res
  }
}
# Ask the user for the input folder and make it the working directory, so the
# relative output path "Combined.xlsx" below ends up in that folder
folder <- getPath()
setwd(folder)
# Get list of files in directory (full paths of everything ending in .xlsx)
pattern.ext <- "\\.xlsx$"
files <- dir(folder, full=TRUE, pattern=pattern.ext)
# Get basenames and remove extension
files.nms <- basename(files)
files.nms <- gsub(pattern.ext, "", files.nms)
# Name each path after its extension-less file name so it can be looked up by name
names(files) <- files.nms
# Iterate to read in files and write to new file
for (nm in files.nms) {
# Read the "Template" sheet of this file from row 9 on, without column names
df <- read.xlsx((files[nm]), sheet = "Template", startRow = 9, colNames = FALSE)
# Write data to sheet
# NOTE(review): startCol/startRow are the same on every iteration, so each
# file is written over the cells the previous file was written to
writeData(wb, "Template", df, startCol = 2, startRow = 2, colNames = FALSE)
}
# Save the workbook, replacing an existing Combined.xlsx
saveWorkbook(wb, "Combined.xlsx", overwrite = TRUE)
EDIT:
So with the loop below, I am successfully reading in the files and merging them. Thanks for all the help!
for (nm in files.nms) {
  # Load the "Template" sheet of this file; reading starts at row 8, which
  # supplies the column names
  sheet_data <- read.xlsx(
    files[nm],
    sheet = "Template",
    startRow = 8,
    colNames = TRUE,
    detectDates = TRUE,
    skipEmptyRows = FALSE,
    skipEmptyCols = FALSE
  )
  # Stack it under the rows collected so far
  allData <- rbind(allData, sheet_data)
}
EDIT: FINAL SOLUTION
Thanks to everyone for the help!!
library(openxlsx) # Excel and csv files
library(svDialogs) # Dialog boxes
# Start from an empty workbook ...
wb <- createWorkbook()
# ... that holds a single sheet called "Template"
addWorksheet(wb, sheetName = "Template")
# Function to get user path
#
# Keeps prompting until the user enters an existing directory and returns
# that path. The working directory is not changed here.
#
# Returns: the chosen directory (character scalar).
# Errors:  stops when the dialog is cancelled.
getPath <- function() {
  repeat {
    # Ask for path
    path <- dlgInput("Enter path to files: ", Sys.info()["user"])$res
    # A cancelled dialog comes back as a zero-length result; handing that to
    # `if` would fail with "argument is of length zero"
    if (length(path) == 0) {
      stop("No directory selected.", call. = FALSE)
    }
    if (dir.exists(path)) {
      return(path)
    }
    # Not an existing directory: report it and ask again
    dlg_message("Error: The path you entered is not a valid directory. Please try again.")$res
  }
}
# Call getPath function
folder <- getPath()
# Set working directory, so "Combined.xlsx" below is written into that folder
setwd(folder)
# Get list of files in directory
pattern.ext <- "\\.xlsx$"
files <- dir(folder, full.names = TRUE, pattern = pattern.ext)
# Leave out the output of an earlier run: it is saved into this same folder
# and would otherwise be read back in as if it were an input file
files <- files[basename(files) != "Combined.xlsx"]
if (length(files) == 0) {
  stop("No .xlsx files to merge in ", folder, call. = FALSE)
}
# Get basenames (extension kept) and use them as names of the paths
files.nms <- basename(files)
names(files) <- files.nms
# Create list (reserve memory)
f.List <- vector("list", length(files))
# Loop and load files: "Template" sheet, column names taken from row 8
for (nm in seq_along(files)) {
  f.List[[nm]] <- read.xlsx(
    files[nm],
    sheet = "Template",
    startRow = 8,
    colNames = TRUE,
    detectDates = TRUE,
    skipEmptyRows = FALSE,
    skipEmptyCols = FALSE
  )
}
# Append the data in one go
allData <- do.call("rbind", f.List)
# Member site = second "-"-separated piece of Specimen.ID (NA when there is
# no second piece). vapply() guarantees a character vector of matching length.
member_site <- vapply(
  strsplit(allData$Specimen.ID, "-"),
  function(parts) parts[2],
  character(1)
)
# Add it as the new first column
allData <- data.frame(Member.Site = member_site, allData, stringsAsFactors = FALSE)
# Write data to sheet
writeData(wb, "Template", allData, startCol = 1)
# Save workbook
saveWorkbook(wb, "Combined.xlsx", overwrite = TRUE)
First of all, you are providing a lot of information in your question, which is generally a good thing, but I’m wondering if you could make your problems easier to solve by recreating the problem using fewer and smaller files. Could you figure out how to merge two files, each containing a small amount of data first?
With regards to the first challenge you raise:
1) Yes you are overwriting the workbook in each loop. I would suggest you load the data and append it to a data.frame and then store the end result after loading all the files. Have a look at the example below. Please note that this example uses rbind, which is inefficient if you are combining a large number of files. So if you have many files you may need to use a different structure.
# Create an empty data frame
allData <- data.frame()
# Loop and load files
for (nm in files.nms) {
  # Read in files
  df <- read.xlsx(files[nm], sheet = "Template", startRow = 9, colNames = FALSE)
  # Append the data
  allData <- rbind(allData, df)
}
# Write the accumulated rows of all files to the sheet once, after the loop
# (`df` only ever holds the most recently read file)
writeData(wb, "Template", allData, startCol = 2, startRow = 2, colNames = FALSE)
Hopefully this gets you closer to what you need!
Edit: Updating the answer to address the comments made
If you have more than a few files, rbind will get slow, as @Parfait mentioned, because multiple copies of the data are made. The way to avoid this is to first reserve space in memory by creating an empty list with enough slots to hold your data, then fill in the list, and only at the end merge all the data together using do.call("rbind", ...). I've compiled some sample code below that's in line with what you provided in your question.
# Create list (reserve memory): one slot per input file
f.List <- vector("list", length(files))
# Loop and load files; seq_along() gives no iterations when there are no files
for (eNr in seq_along(files)) {
  # Read in files. The path comes from `files`; `files.nms` only holds the
  # bare file names without extension and cannot be opened as a file.
  f.List[[eNr]] <- read.xlsx(files[[eNr]], sheet = "Template", startRow = 9)
}
# Append the data
allData <- do.call("rbind", f.List)
Below to illustrate this further, a small reproducible example. It uses just a couple of data frames, but it illustrates the process of creating a list, populating that list, and merging the data as the last step.
# Sample data (df2 lists its columns in the opposite order; rbind() on data
# frames lines the columns up by name)
df1 <- data.frame(x = 1:3, y = 3:1)
df2 <- data.frame(y = 4:6, x = 3:1)
df.List <- list(df1, df2)
# Create list with one slot per data frame
d.List <- vector("list", length(df.List))
# Loop and add data; seq_along() gives no iterations for an empty list,
# whereas 1:length() would count 1, 0
for (eNr in seq_along(df.List)) {
  d.List[[eNr]] <- df.List[[eNr]]
}
# Bind all at once
dfAll <- do.call("rbind", d.List)
print(dfAll)
Hope this helps! Thanks!

Save different csv files as one excel workbook with multiple sheets

I am trying to use the xlsx package to put different csv files into one excel workbook with multiple sheets. I found a routine that should work but it is not working for me.
So I have different csv files:
S:/productivity/R/Results/2008.csv
S:/productivity/R/Results/2009.csv
S:/productivity/R/Results/2010.csv
S:/productivity/R/Results/2011.csv
S:/productivity/R/Results/2012.csv
My R codes look like:
# loading the library
library(xlsx)
# Clear every object from the current workspace
rm(list = ls())
# getting the path of all csv files
# NOTE(review): system() is handed a directory path here, not a command that
# lists files; this is the call that produces the "not found" error quoted
# after the script
myfiles = system("S:/productivity/R/Results",intern = TRUE)
# New, empty workbook
wb <- createWorkbook()
# going through each csv file
for (item in myfiles) {
# create a sheet in the workbook, named after the fifth "/"-separated part of
# the path (e.g. "2008.csv" for S:/productivity/R/Results/2008.csv)
sheet <- createSheet(wb, sheetName=strsplit(item,"/")[[1]][5])
# add the data to the new sheet
addDataFrame(read.csv(item), sheet)
}
# saving the workbook
saveWorkbook(wb, "2008_2012.xlsx")
I receive the following error:
myfiles = system('"S:/productivity/R/Results"',intern = TRUE)
Error in system("\"S:/productivity/R/Results\"", intern = TRUE) :
'"S:/productivity/R/Results"' not found
Personally, I use XLConnect for these tasks.
The steps for writing to multiple sheets are:
create a new workbook
create the sheet
output the data to the sheet
save the sheet
--
SAMPLE CODE:
library(data.table) ## for fast fread() function
library(XLConnect)
## Input folder and output workbook
folder <- "folder/where/CSV_files_are_located"
f.out <- "path/to/file.xlsx"

## Open the workbook, creating it when it does not exist yet
wb <- loadWorkbook(f.out, create = TRUE)

## Full paths of all csv files, keyed by file name without the extension
## (those keys double as sheet names)
pattern.ext <- "\\.csv$"
files <- dir(folder, full.names = TRUE, pattern = pattern.ext)
files.nms <- gsub(pattern.ext, "", basename(files))
names(files) <- files.nms

## One sheet per csv file
for (sheet_name in files.nms) {
  ## read the csv
  csv_data <- fread(files[[sheet_name]])
  ## make the sheet it goes to
  createSheet(wb, name = sheet_name)
  ## and write the contents, header row included
  writeWorksheet(
    object = wb,
    data = csv_data,
    sheet = sheet_name,
    header = TRUE,
    rownames = NULL
  )
}
saveWorkbook(wb)
if you would like to see your file
## Relies on an `open` command being available on the system.
## shQuote() quotes the path for the shell, so folders or files with spaces
## in their names are passed as a single argument.
system(sprintf("open %s", shQuote(dirname(f.out)))) ## For the containing folder
system(sprintf("open %s", shQuote(f.out))) ## for opening the file with default app, ie excel

Resources