Reading Excel file into R with operator in tab name - r

I am reading a file with different tabs into R. However, they changed the tab names so they contain operators now, which R doesn't seem to like. For instance (and this is where the error occurs) "Storico_G1" became "Storico_G+1".
I post the code below, but the error occurs early on. I am basically looking for a workaround, or a way to change the tab names before I create the data.frames.
NB: I left the code as it was before they changed the tab name from "Storico_G1" to "Storico_G+1", as I think it's easier to grasp this way.
Can anybody guide me in the right direction? Many thanks in advance!
library(ggplot2)
library(lubridate)
library(openxlsx)
library(reshape2)
library(dplyr)
library(scales)
Storico_G <- read.xlsx(xlsxFile = "http://www.snamretegas.it/repository/file/it/business-servizi/dati-operativi-business/dati_operativi_bilanciamento_sistema/2018/DatiOperativi_2018-IT.xlsx",sheet = "Storico_G", startRow = 1, colNames = TRUE)
Storico_G1 <- read.xlsx(xlsxFile = "http://www.snamretegas.it/repository/file/it/business-servizi/dati-operativi-business/dati_operativi_bilanciamento_sistema/2018/DatiOperativi_2018-IT.xlsx", startRow = 1, colNames = TRUE)
# Selecting Column C,E,R from Storico_G and stored in variable Storico_G_df
# Selecting Column A,P from Storico_G+1 and stored in variable Storico_G1_df
Storico_G_df <- data.frame(Storico_G$pubblicazione,Storico_G$IMMESSO, Storico_G$`RICONSEGNATO.(1)`, Storico_G$BILANCIAMENTO.RESIDUALE )
Storico_G1_df <- data.frame(Storico_G1$pubblicazione, Storico_G1$`SBILANCIAMENTO.ATTESO.DEL.SISTEMA.(SAS)`)
# Converting pubblicazione to date-time format
Storico_G_df$pubblicazione <- ymd_h(Storico_G_df$Storico_G.pubblicazione)
Storico_G1_df$pubblicazione <- ymd_h(Storico_G1_df$Storico_G1.pubblicazione)
# Selecting only the rows with the 4 PM value in the Storico_G+1 Excel sheet tab
Storico_G1_df <- subset(Storico_G1_df, hour(Storico_G1_df$pubblicazione) == 16)
rownames(Storico_G1_df) <- 1:nrow(Storico_G1_df)
# Averaging hourly values to 1 daily data point in G excel sheet tab
Storico_G_df$Storico_G.pubblicazione <- strptime(Storico_G_df$Storico_G.pubblicazione, "%Y_%m_%d_%H")
storico_G_df_agg <- aggregate(Storico_G_df, by=list(day=format(Storico_G_df$Storico_G.pubblicazione, "%F")), FUN=mean, na.rm=TRUE)[,-2]
#cbind.fill function
#' Column-bind objects of unequal height, padding the short ones with NA
#'
#' @param ... Matrices, data frames or vectors; each one is coerced with
#'   `as.matrix()` before binding.
#' @return A matrix with as many rows as the tallest input, or `NULL` when
#'   called without arguments.
cbind.fill <- function(...) {
  parts <- lapply(list(...), as.matrix)
  # Nothing to bind: mirror cbind(), which returns NULL for no input.
  if (length(parts) == 0L) {
    return(NULL)
  }
  # vapply() guarantees an integer vector whatever the inputs look like.
  n <- max(vapply(parts, nrow, integer(1)))
  padded <- lapply(parts, function(x) {
    # Explicit NA filler; a zero-row filler leaves x unchanged.
    rbind(x, matrix(NA, n - nrow(x), ncol(x)))
  })
  do.call(cbind, padded)
}
#cbind with both frames
G_G1_df= data.frame(cbind.fill(storico_G_df_agg,Storico_G1_df))
#keep required columns
keep=c("day", "Storico_G.IMMESSO","Storico_G..RICONSEGNATO..1..","Storico_G1..SBILANCIAMENTO.ATTESO.DEL.SISTEMA..SAS..")
#update dataframe to kept variables
G_G1_df=G_G1_df[,keep,drop=FALSE]
#Rename crazy variable names
G_G1_df <- data.frame(G_G1_df) %>%
select(day, Storico_G.IMMESSO, Storico_G..RICONSEGNATO..1.., Storico_G1..SBILANCIAMENTO.ATTESO.DEL.SISTEMA..SAS..)
names(G_G1_df) <- c("day", "Immesso","Riconsegnato", "SAS")
#Melt time series
G_G1_df=melt(G_G1_df,id.vars = "day")
#Create group variable
G_G1_df$group<- ifelse(G_G1_df$variable == "SAS", "SAS", "Immesso/Consegnato")
#plot
ggplot(G_G1_df, aes(as.Date(day),as.numeric(value),col=variable))+geom_point()+geom_line()+facet_wrap(~group,ncol=1,scales="free_y")+labs(x="Month", y="Values") +scale_x_date(labels=date_format("%m-%Y"))+geom_abline(intercept=c(-2,0,2),slope=0,data=subset(G_G1_df,group=="SAS"),lwd=0.5,lty=2)

Related

R: Iterating over a df to paste one slice at a time into MS Excel Spreadsheet using lapply and seq_along

PROBLEM: I am trying to automate data input into a legacy MS Excel spreadsheet that carries out calculations. I have figured out how to use openxlsx to do this one "slice" of data at a time. I would like to modify the code below to iterate over the entire dataset rather than doing this piecemeal.
#NO ITERATION
#load libraries and define working directory
if(!require(tidyr)){install.packages("tidyr")}
if(!require(openxlsx)){install.packages("openxlsx")}
setwd("C:/R/Seq_along")
#Sample data frame
Site <- rep(letters[1:6],each=3)
Param <- as.factor(rep(c("X","Y","Z"),6))
set.seed(71)
Result <- sample(0:25, 18, replace = TRUE)
df <- data.frame(Site,Param,Result)
df
str(df)
#Pivot from long to wide format
df.long <- df %>% pivot_wider(names_from = Site, values_from = Result)
#Define list of working slices and workbook sheets
slicelist <- list(c(2:4),c(5:7))
#sheetlist <- list(c("Sheet1"),c("Sheet2"))
#Slice #1
#Slice df into first slice
slicecurrent <- slicelist[[1]]
#sheetcurrent <- sheetlist[[1]]
df1 <- df.long[,slicecurrent]
#Loadworkbook and write first slice into sheet 1 of workbook
wb1 <- loadWorkbook("test.xlsx")
class(wb1)
names(wb1)
writeData(wb1,sheet="Sheet1", x = df1, xy = c(2,6), colNames = FALSE, rowNames = FALSE)
saveWorkbook(wb1,"test.xlsx", overwrite = TRUE)
#openXL("test.xlsx")
#Slice #2
#Slice df into first slice
slicecurrent <- slicelist[[2]]
#sheetcurrent <- sheetlist[[2]]
df1 <- df.long[,slicecurrent]
#Loadworkbook and write first slice into sheet 1 of workbook
wb1 <- loadWorkbook("test.xlsx")
class(wb1)
names(wb1)
writeData(wb1,sheet="Sheet2", x = df1, xy = c(2,6), colNames = FALSE, rowNames = FALSE)
saveWorkbook(wb1,"test.xlsx", overwrite = TRUE)
openXL("test.xlsx")
The data I am using need to be cut into "slices" (using 2 slices here) since the spreadsheet
can only handle a certain number of data columns at a time (in this case 3). The actual data to be pasted onto the spreadsheet is called "df.long", and the test spreadsheet is a blank spreadsheet called "test.xlsx" that is saved in the working directory.
Here is an image of the result I am after:
https://ibb.co/SJ0xG4Q
I have tried using lapply in combination with the seq_along function to specify the slices to use and iterate the process. However, rather than pasting the first chunk into sheet1 of the spreadsheet and the second chunk into sheet2 like the no-iteration script does, it iterates over all elements of the list and I end up with the same chunk being pasted on both sheet1 and sheet2 (as I have told it to do so).
#load libraries and define working directory
if(!require(tidyr)){install.packages("tidyr")}
if(!require(openxlsx)){install.packages("openxlsx")}
setwd("C:/R/Seq_along")
#Sample data frame
Site <- rep(letters[1:6],each=3)
Param <- as.factor(rep(c("X","Y","Z"),6))
set.seed(71)
Result <- sample(0:25, 18, replace = TRUE)
df <- data.frame(Site,Param,Result)
df
str(df)
#Pivot from long to wide format
df.long <- df %>% pivot_wider(names_from = Site, values_from = Result)
#Define list of working slices and workbook sheets
slicelist <- list(c(2:4),c(5:7))
#sheetlist <- list(c("Sheet1"),c("Sheet2"))
lapply(seq_along(slicelist),
function(i){
#Slice #1
#Slice df into first slice
#slicecurrent <- slicelist[[1]]
#sheetcurrent <- sheetlist[[1]]
df1 <- df.long[,slicelist[[i]]]
#Loadworkbook and write first slice into sheet 1 of workbook
wb1 <- loadWorkbook("test.xlsx")
class(wb1)
names(wb1)
writeData(wb1,sheet="Sheet1", x = df1, xy = c(2,6), colNames = FALSE, rowNames = FALSE)
saveWorkbook(wb1,"test.xlsx", overwrite = TRUE)
#openXL("test.xlsx")
#Slice #2
#Slice df into first slice
#slicecurrent <- slicelist[[2]]
#sheetcurrent <- sheetlist[[2]]
df1 <- df.long[,slicelist[[i]]]
#Loadworkbook and write first slice into sheet 1 of workbook
wb1 <- loadWorkbook("test.xlsx")
class(wb1)
names(wb1)
writeData(wb1,sheet="Sheet2", x = df1, xy = c(2,6), colNames = FALSE, rowNames = FALSE)
saveWorkbook(wb1,"test.xlsx", overwrite = TRUE)
})
openXL("test.xlsx")
Is there a way I can replicate the result of the first script using lapply with the seq_along function, or should I be using a for loop instead?
1: https://i.stack.imgur.com/XSmhT.jpg
2: https://i.stack.imgur.com/aPGjG.jpg
3: https://ibb.co/SJ0xG4Q
Either you do not know how to use lapply() or you do not know how to use openxlsx. Anyhow here is a cleaned up working example using only base R and openxlsx. In addition, for something like this I would use a simple for loop.
library(openxlsx)
#Sample data frame
Site <- rep(letters[1:6],each=3)
Param <- as.factor(rep(c("X","Y","Z"),6))
set.seed(71)
Result <- sample(0:25, 18, replace = TRUE)
df <- data.frame(Site,Param,Result)
#Pivot from long to wide format
df.long <- reshape(df, idvar = "Param", timevar = "Site", direction = "wide")
#Define list of working slices and workbook sheets
slicelist <- list(c(2:4),c(5:7))
# wb1 <- loadWorkbook("test.xlsx") # I do not have this file
# createWorkbook()'s first argument is the workbook creator, not a file
# name; the file name is only needed by saveWorkbook() below.
wb1 <- createWorkbook()
# Write slice i of df.long to the sheet named "Sheet<i>". wb1 is saved after
# the loop without being reassigned, so this relies on addWorksheet() and
# writeData() updating the workbook object in place.
out <- lapply(
  seq_along(slicelist),
  function(i) {
    dfi <- df.long[, slicelist[[i]]]
    sheet <- paste0("Sheet", i)
    addWorksheet(wb1, sheet) # I add the sheet. Remove this line if you have the sheets
    writeData(wb1, sheet = sheet, x = dfi, xy = c(2, 6), colNames = FALSE, rowNames = FALSE)
  })
saveWorkbook(wb1, "test.xlsx", overwrite = TRUE)
openXL("test.xlsx")

Rbinding large list of dataframes after I did some data cleaning on the list

My problem is, that I can't merge a large list of dataframes before doing some data cleaning. But it seems like my data cleaning is missing from the list.
I have 43 xlsx-files, which I've put in a list.
Here's my code for that part:
# Locate every Excel workbook below the working directory. `pattern` is a
# regular expression, not a glob, so anchor on the literal ".xlsx" suffix.
file.list <- list.files(recursive = TRUE, pattern = "\\.xlsx$")
# Read the first sheet of each workbook and tag its rows with the source file.
dat <- lapply(file.list, function(i) {
  x <- read.xlsx(i, sheet = 1, startRow = 2, colNames = TRUE,
                 skipEmptyCols = TRUE, skipEmptyRows = TRUE)
  # Create column with file name
  x$file <- i
  # Return data
  x
})
I then did some datacleaning. Some of the dataframes had some empty columns that weren't skipped in the loading and some columns I just didn't need.
Example of how I removed one column (X1) from all dataframes in the list:
dat <- lapply(dat, function(x) { x["X1"] <- NULL; x })
I also applied column names:
colnames <- c("ID", "UDLIGNNR","BILAGNR", "AKT", "BA",
"IART", "HTRANS", "DTRANS", "BELOB", "REGD",
"BOGFD", "AFVBOGFD", "VALORD", "UDLIGND",
"UÅ", "AFSTEMNGL", "NRBASIS", "SPECIFIK1",
"SPECIFIK2", "SPECIFIK3", "PERIODE","FILE")
dat <- lapply(dat, setNames, colnames)
My problem is that when I open the list or look at the elements in the list, my data cleaning is missing.
And I can't bind the dataframes before the data cleaning since they don't all have the same structure.
What am I doing wrong here?
EDIT: Sample data*
# Sample data
a <- c("a","b","c")
b <- c(1,2,3)
X1 <- c("", "","")
c <- c("a","b","c")
X2 <- c(1,2,3)
X1 <- c("", "","")
df1 <- data.frame(a,b,c,X1)
df2 <- data.frame(a,b,c,X1,X2)
# Putting in list
dat <- list(df1,df2)
# Removing unwanted columns
dat <- lapply(dat, function(x) { x["X1"] <- NULL; x })
dat <- lapply(dat, function(x) { x["X2"] <- NULL; x })
# Setting column names
colnames <- c("Alpha", "Beta", "Gamma")
dat <- lapply(dat, setNames, colnames)
# Merging dataframes
df <- do.call(rbind,dat)
So I've just found that with my sample data this goes smoothly.
I had to reopen the list in View-mode to see the changes I made. That doesn't change the fact that when writing to csv and reopening, all the data cleaning is missing (I haven't tried this with my sample data).
I am wondering if it's because I've changed the merge?
# My merge when I wrote this question:
df <- do.call("rbindlist", dat)
# My merge now:
df <- do.call(rbind,dat)
When I use my real data it doesn't go as smoothly, so I guess the sample data is bad. I don't know what I'm doing wrong, so I can't give better sample data.
The message I get when merging with rbind:
error in rbind(deparse.level ...) numbers of columns of arguments do not match

Import multiple xlsx file in R

I have several xlsx files in a directory with the same structure (i.e. column A,B,C); every file is the data of one day.
I need to import all the data in R and find the differences between one day and the next one.
files <- list.files(pattern = ".xlsx")
for (i in seq_along(files)) {
assign(paste("Day", i, sep = "."), read.xlsx(files[i]))
}
I can't figure out how to use the imported data.
For example
Day.1 <- data.frame(Day.1)
Day.1$A <- as.character(Day.1$A)
Day.2 <- data.frame(Day.2)
Day.2$A <- as.character(Day.2$A)
anti_join (Day.1, Day.2)
This code works fine but how should it be with a variable?
Day.[i] <- data.frame(Day.[i])
Day.[i]$A <- as.character(Day.[i]$A)
Day.[i+1] <- data.frame(Day.[i+1])
Day.[i+1]$A <- as.character(Day.[i+1]$A)
anti_join (Day.[i], Day.[i+1])
I tried to import all the files in a single data frame but I have a similar problem about how to use the new data
file.list <- list.files(pattern='*.xlsx')
days.list <- lapply(file.list, read_excel)
days <- rbindlist(days.list, idcol = "id")
days <- data.frame(days)
days$B <- as.character(days$B)
But I don't know how to do something like:
day1 <- filter(days, id==1)
day2 <- filter(days, id==2)
diff1 <- anti_join (day1, day2, by=c("B", "C"))
using a counter variable (i)
day(i) <- filter(days, id==(i))
day(i+1) <- filter(days, id==(i+1))
diff1 <- anti_join (day1, day2, by=c("B", "C"))
Consider using base R's Map (wrapper to mapply) between a dataframe list of (days) and (days + 1), respectively the left and right sides of dplyr::anti_join. Of course the very last day will not have a forward day comparison.
library(xlsx)
library(dplyr)
# `pattern` is a regular expression, so match the literal ".xlsx" suffix.
file.list <- list.files(pattern = "\\.xlsx$")
df.list <- lapply(file.list, function(f) {
  read.xlsx(f, 1, stringsAsFactors = FALSE)
})
# head()/tail() with a negative count stay correct for 0 or 1 files, where
# 1:length(x)-1 and 2:length(x) would index position 0 or count backwards.
left_days <- head(df.list, -1)  # SUBSET OUT LAST DAY
right_days <- tail(df.list, -1) # SUBSET OUT FIRST DAY
# WITHOUT ARGS
anti_join_list <- Map(anti_join, left_days, right_days)
# WITH ARGS
anti_join_list <- Map(function(x, y) anti_join(x, y, by = c("B", "C")), left_days, right_days)

How to read and use the dataframes with the different names in a loop?

I'm struggling with the following issue: I have many data frames with different names (For instance, Beverage, Construction, Electronic etc., dim. 540x1000). I need to clean each of them, calculate and save as zoo object and R data file. Cleaning is the same for all of them - deleting the empty columns and the columns with some specific names.
For example:
Beverages <- Beverages[,colSums(is.na(Beverages))<nrow(Beverages)] #removing empty columns
Beverages_OK <- Beverages %>% select (-starts_with ("X.ERROR")) # dropping X.ERROR column
Beverages_OK[, 1] <- NULL #dropping the first column
Beverages_OK <- cbind(data[1], Beverages_OK) # adding a date column
Beverages_zoo <- read.zoo(Beverages_OK, header = FALSE, format = "%Y-%m-%d")
save (Beverages_OK, file = "StatisticsInRFormat/Beverages.RData")
I tried to use the 'lapply' function like this:
list <- ls() # the list of all the dataframes
lapply(list, function(X) {
temp <- X
temp <- temp [,colSums(is.na(temp))< nrow(temp)] #removing empty columns
temp <- temp %>% select (-starts_with ("X.ERROR")) # dropping X.ERROR column
temp[, 1] <- NULL
temp <- cbind(data[1], temp)
X_zoo <- read.zoo(X, header = FALSE, format = "%Y-%m-%d") # I don't know how to have the zame name as X has.
save (X, file = "StatisticsInRFormat/X.RData")
})
but it doesn't work. Is there any way to do such a job? Is there any R package that facilitates it?
Thanks a lot.
If you are sure that you have only the needed data frames in the environment, this should get you started:
df1 <- mtcars
df2 <- mtcars
df3 <- mtcars
# Collect the data frames of the current environment into a named list.
# mget() fetches all objects at once and Filter() drops anything that is not
# a data frame; no variable called `list` is created, so base::list() is not
# shadowed.
frames <- Filter(is.data.frame, mget(ls()))
lapply(frames, function(tmp) {
  # `tmp` is one data frame; the per-frame cleaning steps go here.
  tmp
})

merge data frames based on non-identical values in R

I have two data frames. First one looks like
dat <- data.frame(matrix(nrow=2,ncol=3))
names(dat) <- c("Locus", "Pos", "NVAR")
dat[1,] <- c("ACTC1-001_1", "chr15:35087734..35087734", "1" )
dat[2,] <- c("ACTC1-001_2 ", "chr15:35086890..35086919", "2")
where chr15:35086890..35086919 indicates all the numbers within this range.
The second looks like:
dat2 <- data.frame(matrix(nrow=2,ncol=3))
names(dat2) <- c("VAR","REF.ALT"," FUNC")
dat2[1,] <- c("chr1:116242719", "T/A", "intergenic" )
dat2[2,] <- c("chr1:116242855", "A/G", "intergenic")
I want to merge these by the values in dat$Pos and dat2$VAR. If the single number in a cell in dat2$VAR is contained within the range of a cell in dat$Pos, I want to merge those rows. If this occurs more than once (dat2$VAR in more than one range in dat$Pos, I want it merged each time). What's the easiest way to do this?
Here is a solution, quite short but not particularly efficient so I would not recommend it for large data. However, you seemed to indicate your data was not that large so give it a try and let me know:
library(plyr)
exploded.dat <- adply(dat, 1, function(x){
parts <- strsplit(x$Pos, ":")[[1]]
chr <- parts[1]
range <- strsplit(parts[2], "..", fixed = TRUE)[[1]]
start <- range[1]
end <- range[2]
data.frame(VAR = paste(chr, seq(from = start, to = end), sep = ":"), x)
})
merge(dat2, exploded.dat, by = "VAR")
If it is too slow or uses too much memory for your needs, you'll have to implement something a bit more complex and this other question looks like a good starting point: Merge by Range in R - Applying Loops.
Please try this out and let us know how it works. Without a larger data set it is a bit hard to trouble shoot. If for whatever reason it does not work, please share a few more rows from your data tables (specifically ones that would match)
# SPLICE THE DATA
# dat$Pos looks like "chr15:35086890..35086919": chromosome, start, end.
pos.parts <- do.call(rbind, strsplit(trimws(dat$Pos), ":", fixed = TRUE))
range.strings <- do.call(rbind, strsplit(pos.parts[, 2], "..", fixed = TRUE))
mins <- as.numeric(range.strings[, 1])
maxs <- as.numeric(range.strings[, 2])
# dat2$VAR looks like "chr1:116242719": chromosome, position.
# Base strsplit() is used throughout, so stringr is not required.
var.parts <- do.call(rbind, strsplit(trimws(dat2$VAR), ":", fixed = TRUE))
d2.vars <- as.numeric(var.parts[, 2])
# FIND THE MATCHES
# row number is the row in dat
# col number is the row in dat2
# A match needs the same chromosome and a position inside the range,
# both end points included.
matches <- outer(
  seq_along(mins), seq_along(d2.vars),
  function(i, j) {
    pos.parts[i, 1] == var.parts[j, 1] &
      mins[i] <= d2.vars[j] & d2.vars[j] <= maxs[i]
  }
)
# MERGE
# One output row per matching (dat row, dat2 row) pair, so a variant that
# falls in several ranges is merged once for each of them.
hits <- which(matches, arr.ind = TRUE)
merged <- cbind(
  dat[hits[, "row"], , drop = FALSE],
  dat2[hits[, "col"], , drop = FALSE]
)
rownames(merged) <- NULL
merged

Resources