I have a function that takes two arguments, a link and and an State name abbreviation. The function gets a spreadsheet from a remote site and
makes me a data frame of the state data that I want.
library(lubridate)
library(tidyr)
library(gdata)
library(dplyr)
options(stringsAsFactors = FALSE)
nsw_link <- 'http://www.abs.gov.au/ausstats/abs#archive.nsf/log?openagent&3101051.xls&3101.0&Time%20Series%20Spreadsheet&3F92BFA30BC29940CA257F1D001427C3&0&Jun%202015&17.12.2015&Latest'
qld_link <- "http://www.abs.gov.au/ausstats/abs#archive.nsf/log?openagent&3101053-.xls&3101.0&Time%20Series%20Spreadsheet&2927EBD7E6856BABCA257F1D0014283D&0&Jun%202015&17.12.2015&Latest"
vic_link <- "http://www.abs.gov.au/ausstats/abs#archive.nsf/log?openagent&3101052.xls&3101.0&Time%20Series%20Spreadsheet&E3B2958632AB29ECCA257F1D001427FB&0&Jun%202015&17.12.2015&Latest"
nt_link <- "http://www.abs.gov.au/ausstats/abs#archive.nsf/log?openagent&3101057.xls&3101.0&Time%20Series%20Spreadsheet&CCB60AB638D60938CA257F1D0014291C&0&Jun%202015&17.12.2015&Latest"
tas_link <- "http://www.abs.gov.au/ausstats/abs#archive.nsf/log?openagent&3101056.xls&3101.0&Time%20Series%20Spreadsheet&8CA5625A306A4805CA257F1D001428D7&0&Jun%202015&17.12.2015&Latest"
act_link <- "http://www.abs.gov.au/ausstats/abs#archive.nsf/log?openagent&3101058.xls&3101.0&Time%20Series%20Spreadsheet&4234206BA89A82F6CA257F1D00142959&0&Jun%202015&17.12.2015&Latest"
sa_link <- "http://www.abs.gov.au/ausstats/abs#archive.nsf/log?openagent&3101054.xls&3101.0&Time%20Series%20Spreadsheet&A342BFFB06A62F4FCA257F1D0014286D&0&Jun%202015&17.12.2015&Latest"
wa_link <- "http://www.abs.gov.au/ausstats/abs#archive.nsf/log?openagent&3101055.xls&3101.0&Time%20Series%20Spreadsheet&DC6208699BE4D2FFCA257F1D0014289E&0&Jun%202015&17.12.2015&Latest"
get_ERP_data <- function(link, state){
# Get the xls file and slice only the columns needed for male and female ERPs
xls_data <- read.xls(link, sheet = 'Data1')
xls_data <- tbl_df(xls_data)
xls_data <- xls_data[, 1:203]
names(xls_data) <- gsub(pattern = "Estimated.Resident.Population....", '', names(xls_data))
names(xls_data) <- gsub('[.]+', '', names(xls_data))
names(xls_data) <- gsub('100andover', '101', names(xls_data))
names(xls_data) <- gsub("(\\d+)$", ".\\1", names(xls_data))
xls_data <- xls_data[28:54, ]
names(xls_data)[1] <- 'Year'
xls_data$Year <- paste('01', xls_data$Year, sep = '-')
xls_data$Year <- dmy(xls_data$Year)
xls_data$Year <- year(xls_data$Year)
# make a long version of the ERP data
xls_data_long <- xls_data %>% gather(Sex_Age, Population, Male.0:Female.101)
# Make two new column, sex and age, from the sex_age column
xls_data_sep_log <- xls_data_long %>% separate(Sex_Age, c('Sex', 'Age'))
# Recode observations from 101 to 'over 100'
xls_data_sep_log$Age[grep(xls_data_sep_log$Age, pattern = '101')] <- 'Over 100'
xls_data_sep_log$State <- state
df_name <- paste(state, 'data', sep = '_')
assign(df_name,xls_data_sep_log,envir = .GlobalEnv)
}
link_list <- list(act_link, nsw_link, nt_link, qld_link, sa_link, tas_link, vic_link, wa_link)
states <- c('ACT', 'NSW', 'NT', 'QLD', 'SA', 'TAS', 'VIC', 'WA')
For example, if I want Queensland estimated resident population data I run:
get_ERP_data(qld_link, 'QLD')
This makes me a longitudinal data frame of the data I need.
I would like to extend this so that I can pass a vector or list of states to a list of links as parameter and return a data frame for each
argument in the paramter? I am not particularly concerned with binding resulting data frames in the function output.
Similar to the example above, how could I get my function to run something like:
get_ERP_data(as.list(qld_link, nsw_link), c('QLD', 'NSW'))
I tried using variations of sapply and do.call but did not get very far. Is there a way to use some equivalent of pythons *args/**kwargs
to pass lists as arguments to my function?
Any help would be appreciated.
Related
I have a dataset dt, it stored list dataset names, I need to use them to create some new datasets with select some variables, then I use the dataset I just created, repeat the same process .....
The first row and second row were data available.
Then use data available to create a new data.
Then use data just create to create a new data
The final output was list of datasets
I appreciated any helps or suggestions.
dt <- data.frame(name = c("mtcars","iris", "mtcars_new","mtcars_new_1"),
data_source = c("mtcars","iris", "mtcars","mtcars_new"),
variable = c("","","mpg,cyl,am,hp","mpg,cyl"), stringsAsFactors = FALSE)
> dt
name data_source variable
1 mtcars mtcars
2 iris iris
3 mtcars_new mtcars mpg,cyl,am,hp
4 mtcars_new_1 mtcars_new mpg,cyl
dt_list <- list(mtcars, iris)
names(dt_list ) <- c("mtcars","iris")
# The final list of datasets
final_dt <- list(mtcars, iris, mtcars_new, mtcars_new_1)
So far if I wrote a loop like that, I got only mtcars_new dataset, but I don't know how to return to the list and continue looping to get mtcars_new_1 and so on. I have many datasets, and I don't know how many times I should looping through nested data.
mtcars_new <- data.frame()
for(i in 1:nrow(dt)){
if(dt$data_source[[i]] %in% names(dt_list) && !dt$name[[i]] %in% names(dt_list)){
check <- eval(parse(text = dt$data_source[[i]]))
var <- c(unlist(strsplit(dt$variable[[i]],",")))
mtcars_new <- check[, colnames(check) %in% var]
}
}
This will produce the desired output shown. Since the fourth loop uses the data created in the third loop, you need to have a way to append the results of each loop to a growing list of available data sets. Then within each loop find which one is the right starting data set from the available list.
dt <- data.frame(name = c("mtcars","iris", "mtcars_new","mtcars_new_1"),
data_source = c("mtcars","iris", "mtcars","mtcars_new"),
variable = c("","","mpg,cyl,am,hp","mpg,cyl"), stringsAsFactors = FALSE)
input_data_sets <- list(mtcars, iris)
names(input_data_sets) <- c("mtcars","iris")
final_data_sets <- list()
for(i in 1:nrow(dt)) {
available_data_sets <- c(input_data_sets, final_data_sets) #Grows a list of all available data sets
num_to_use <- which(dt$data_source[[i]] == names(available_data_sets)) #finds the right list member to use
temp <- available_data_sets[num_to_use][[1]]
var <- c(unlist(strsplit(dt$variable[[i]],",")))
temp <- list(subset(temp, select = var)) #keep only the desired variables
names(temp) <- dt$name[i] #assign the name provided
final_data_sets <- c(final_data_sets, temp) #add to list of final data sets which will be the output. Anything listed here will become part of the available list in the next loop
}
I am trying to use a for loop to multiple create dataframes. The original code works fine for a single run (without the for loop).
object<- c(1,2,3)
for (i in 1:length(object)) {
df1<- SomeFunction1(object[i])
df2<- SomeFunction2(object[i])
df3<- SomeFunction3(object[i])
N.rows <- length(object)
combined <- vector("list", N.rows)
combined[i]<-list(rbind(df1,df2,df3))
When I do this I get combined[3] but not the outputs from the two other variables in my object. I have toyed around with it and managed to get as a result combined1, but again not a list with combined1, combined[2], and combined[3].
UPDATE: I was asked for the concrete example and expected output.
I'm basically taking three CSV files of county census data but organized differently (two with years as rows, and one with years as columns), transforming the data into a consistent format by county and then combining the files.
The above image is the result of View(combine). [[2]] is just what I want, but nothing is stored in [1].
This is the code that I used to get to it:
pop1990.2000 <- read.csv("1990-2000 Census Pop.csv",
stringsAsFactors = FALSE)
pop2000.2010 <- read.csv("2000-2010 Census Pop.csv",
stringsAsFactors = FALSE)
pop2010.2019 <- read.csv("2010-2019 Census Pop.csv",
stringsAsFactors = FALSE)
#Adding Total column "Population"
pop1990.2000$Population <- (rowSums(pop1990.2000) -
pop1990.2000$Year -
pop1990.2000$FIPS.Code)
#Combining State and County FIPS codes "FIPS.Code"
pop2000.2010$FIPS.Code <- (pop2000.2010$STATE*1000+
pop2000.2010$COUNTY)
pop2010.2019$FIPS.Code <- (pop2010.2019$STATE*1000+
pop2010.2019$COUNTY)
my_counties<-c(1125, 1127)
for (i in 1:length(my_counties)) {
#Selecting Pop data for County 1125 for 1990-2000
newdata <- pop1990.2000[ which(pop1990.2000$FIPS.Code==my_counties[i]), ]
newdata2000v1 <- as.data.frame(cbind(Year=newdata$Year,
Population=newdata$Pop))
#Adding FIPs Code
newdata2000v1$FIPS.Code<-my_counties[i]
#Merging County Name by FIPS.Code
pop2000.2010.c.fips <- pop2000.2010 %>%
select(FIPS.Code, CTYNAME)
pop2000.2010.c.fips$County<-pop2000.2010.c.fips$CTYNAME
newdata2000v1 <- newdata2000v1 %>%
mutate(FIPS.Code = as.numeric(FIPS.Code))
newdata2000 <- left_join(newdata2000v1,
pop2000.2010.c.fips,
by = "FIPS.Code")
newdata2000<-newdata2000 %>% select(County, FIPS.Code, Year, Population)
#Selecting Pop data for County 1125 for 2000-2010
newdata2 <- pop2000.2010[ which(pop2000.2010$FIPS.Code==my_counties[i]), ]
newdata2010 <- cbind("2000"=newdata2$ESTIMATESBASE2000,
"2001"=newdata2$POPESTIMATE2001,
"2002"=newdata2$POPESTIMATE2002,
"2003"=newdata2$POPESTIMATE2003,
"2004"=newdata2$POPESTIMATE2004,
"2005"=newdata2$POPESTIMATE2005,
"2006"=newdata2$POPESTIMATE2006,
"2007"=newdata2$POPESTIMATE2007,
"2008"=newdata2$POPESTIMATE2008,
"2009"=newdata2$POPESTIMATE2009)
newdata2010<-as.data.frame(t(newdata2010))
newdata2010$County<-newdata2$CTYNAME
newdata2010$FIPS.Code<-newdata2$FIPS.Code
newdata2010$Year<-c(rownames(newdata2010))
names(newdata2010)[names(newdata2010) == 'V1'] <- 'Population'
newdata2010<-newdata2010 %>% select(County, FIPS.Code, Year, Population)
#Selecting Pop data for County 1125 for 2010-2019
newdata3 <- pop2010.2019[ which(pop2010.2019$FIPS.Code==my_counties[i]), ]
newdata2019 <- cbind(Year=newdata3$Year,
"2010"=newdata3$CENSUS2010POP,
"2011"=newdata3$POPESTIMATE2011,
"2012"=newdata3$POPESTIMATE2012,
"2013"=newdata3$POPESTIMATE2013,
"2014"=newdata3$POPESTIMATE2014,
"2015"=newdata3$POPESTIMATE2015,
"2016"=newdata3$POPESTIMATE2016,
"2017"=newdata3$POPESTIMATE2017,
"2018"=newdata3$POPESTIMATE2018,
"2019"=newdata3$POPESTIMATE2019)
newdata2019<-as.data.frame(t(newdata2019))
newdata2019$County<-newdata3$CTYNAME
newdata2019$FIPS.Code<-newdata3$FIPS.Code
newdata2019$Year<-c(rownames(newdata2019))
names(newdata2019)[names(newdata2019) == 'V1'] <- 'Population'
newdata2019<-newdata2019 %>% select(County, FIPS.Code, Year, Population)
N.rows <- length(my_counties)
combined <- vector("list", N.rows)
combined[i]<-list(rbind(newdata2000,newdata2010,newdata2019))
The problem is that you are re-specifying the creation of the combined object.
I am not sure what exactly your var1 is, but, possibly the following should work:
object<- c(1,2,3)
N.rows <- length(var1)
combined <- vector("list", N.rows)
for (i in 1:length(object)) {
df1<- SomeFunction1(object[i])
df2<- SomeFunction2(object[i])
df3<- SomeFunction3(object[i])
combined[i]<-list(rbind(df1,df2,df3))
}
Alternatively, using lapply:
object<- c(1,2,3)
combined<-lapply(object, function(i){
df1<- SomeFunction1(object[i])
df2<- SomeFunction2(object[i])
df3<- SomeFunction3(object[i])
list(rbind(df1,df2,df3))
}
But this will deliver the list of length 3 (with three lists with df1,df2 and df3), not the length defined by the length of var1...
My problem is, that I can't merge a large list of dataframes before doing some data cleaning. But it seems like my data cleaning is missing from the list.
I have 43 xlsx-files, which I've put in a list.
Here's my code for that part:
file.list <- list.files(recursive=T,pattern='*.xlsx')
dat = lapply(file.list, function(i){
x = read.xlsx(i, sheet=1, startRow=2, colNames = T,
skipEmptyCols = T, skipEmptyRows = T)
# Create column with file name
x$file = i
# Return data
x
})
I then did some datacleaning. Some of the dataframes had some empty columns that weren't skipped in the loading and some columns I just didn't need.
Example of how I removed one column (X1) from all dataframes in the list:
dat <- lapply(dat, function(x) { x["X1"] <- NULL; x })
I also applies column names:
colnames <- c("ID", "UDLIGNNR","BILAGNR", "AKT", "BA",
"IART", "HTRANS", "DTRANS", "BELOB", "REGD",
"BOGFD", "AFVBOGFD", "VALORD", "UDLIGND",
"UÅ", "AFSTEMNGL", "NRBASIS", "SPECIFIK1",
"SPECIFIK2", "SPECIFIK3", "PERIODE","FILE")
dat <- lapply(dat, setNames, colnames)
My problem is, when I open the list or look at the elements in the list, my data cleaning is missing.
And I can't bind the dataframes before the data cleaning since they're aren't looking the same.
What am I doing wrong here?
EDIT: Sample data*
# Sample data
a <- c("a","b","c")
b <- c(1,2,3)
X1 <- c("", "","")
c <- c("a","b","c")
X2 <- c(1,2,3)
X1 <- c("", "","")
df1 <- data.frame(a,b,c,X1)
df2 <- data.frame(a,b,c,X1,X2)
# Putting in list
dat <- list(df1,df2)
# Removing unwanted columns
dat <- lapply(dat, function(x) { x["X1"] <- NULL; x })
dat <- lapply(dat, function(x) { x["X2"] <- NULL; x })
# Setting column names
colnames <- c("Alpha", "Beta", "Gamma")
dat <- lapply(dat, setNames, colnames)
# Merging dataframes
df <- do.call(rbind,dat)
So I've just found that with my sample data this goes smoothly.
I had to reopen the list in View-mode to see the changes I made. That doesn't change the fact that when writing to csv and reopening all the data cleaning is missing (haven'tr tried this with my sample data).
I am wondering if it's because I've changed the merge?
# My merge when I wrote this question:
df <- do.call("rbindlist", dat)
# My merge now:
df <- do.call(rbind,dat)
When I use my real data it doesnøt go as smoothly, so I guess the sample data is bad. I don't know what I'm doing wrong so I can't give some better sample data.
The message I get when merging with rbind:
error in rbind(deparse.level ...) numbers of columns of arguments do not match
I am reading a file with different tabs into R. However, they changed the tab names so they contain operators now, which R doesnt seem to like. For instance (and this is where the code occurs) "Storico_G1" became "Storico_G+1".
I post the code below, but the error occurs early on. I am basically looking for a workaround/to change the tab names before i create data.frames.
NB I left the code as it was before they changed the tab name from "Storico_G1" to "Storico_G+1" as I think its easier to grasp this way.
Can anybody guide me in the right direction? Many thanks in advance!
library(ggplot2)
library(lubridate)
library(openxlsx)
library(reshape2)
library(dplyr)
library(scales)
Storico_G <- read.xlsx(xlsxFile = "http://www.snamretegas.it/repository/file/it/business-servizi/dati-operativi-business/dati_operativi_bilanciamento_sistema/2018/DatiOperativi_2018-IT.xlsx",sheet = "Storico_G", startRow = 1, colNames = TRUE)
Storico_G1 <- read.xlsx(xlsxFile = "http://www.snamretegas.it/repository/file/it/business-servizi/dati-operativi-business/dati_operativi_bilanciamento_sistema/2018/DatiOperativi_2018-IT.xlsx", startRow = 1, colNames = TRUE)
# Selecting Column C,E,R from Storico_G and stored in variable Storico_G_df
# Selecting Column A,P from Storico_G+1 and stored in variable Storico_G1_df
Storico_G_df <- data.frame(Storico_G$pubblicazione,Storico_G$IMMESSO, Storico_G$`RICONSEGNATO.(1)`, Storico_G$BILANCIAMENTO.RESIDUALE )
Storico_G1_df <- data.frame(Storico_G1$pubblicazione, Storico_G1$`SBILANCIAMENTO.ATTESO.DEL.SISTEMA.(SAS)`)
# Conerting pubblicazione in date format and time
Storico_G_df$pubblicazione <- ymd_h(Storico_G_df$Storico_G.pubblicazione)
Storico_G1_df$pubblicazione <- ymd_h(Storico_G1_df$Storico_G1.pubblicazione)
# Selecting on row which is having 4PM value in Storico_G+1 excel sheet tab
Storico_G1_df <- subset(Storico_G1_df, hour(Storico_G1_df$pubblicazione) == 16)
rownames(Storico_G1_df) <- 1:nrow(Storico_G1_df)
# Averaging hourly values to 1 daily data point in G excel sheet tab
Storico_G_df$Storico_G.pubblicazione <- strptime(Storico_G_df$Storico_G.pubblicazione, "%Y_%m_%d_%H")
storico_G_df_agg <- aggregate(Storico_G_df, by=list(day=format(Storico_G_df$Storico_G.pubblicazione, "%F")), FUN=mean, na.rm=TRUE)[,-2]
#cbind.fill function
cbind.fill <- function(...){
nm <- list(...)
nm <- lapply(nm, as.matrix)
n <- max(sapply(nm, nrow))
do.call(cbind, lapply(nm, function (x)
rbind(x, matrix(, n-nrow(x), ncol(x)))))
}
#cbind with both frames
G_G1_df= data.frame(cbind.fill(storico_G_df_agg,Storico_G1_df))
#keep required columns
keep=c("day", "Storico_G.IMMESSO","Storico_G..RICONSEGNATO..1..","Storico_G1..SBILANCIAMENTO.ATTESO.DEL.SISTEMA..SAS..")
#update dataframe to kept variables
G_G1_df=G_G1_df[,keep,drop=FALSE]
#Rename crazy variable names
G_G1_df <- data.frame(G_G1_df) %>%
select(day, Storico_G.IMMESSO, Storico_G..RICONSEGNATO..1.., Storico_G1..SBILANCIAMENTO.ATTESO.DEL.SISTEMA..SAS..)
names(G_G1_df) <- c("day", "Immesso","Riconsegnato", "SAS")
#Melt time series
G_G1_df=melt(G_G1_df,id.vars = "day")
#Create group variable
G_G1_df$group<- ifelse(G_G1_df$variable == "SAS", "SAS", "Immesso/Consegnato")
#plot
ggplot(G_G1_df, aes(as.Date(day),as.numeric(value),col=variable))+geom_point()+geom_line()+facet_wrap(~group,ncol=1,scales="free_y")+labs(x="Month", y="Values") +scale_x_date(labels=date_format("%m-%Y"))+geom_abline(intercept=c(-2,0,2),slope=0,data=subset(G_G1_df,group=="SAS"),lwd=0.5,lty=2)
I'm struggling with the following issue: I have many data frames with different names (For instance, Beverage, Construction, Electronic etc., dim. 540x1000). I need to clean each of them, calculate and save as zoo object and R data file. Cleaning is the same for all of them - deleting the empty columns and the columns with some specific names.
For example:
Beverages <- Beverages[,colSums(is.na(Beverages))<nrow(Beverages)] #removing empty columns
Beverages_OK <- Beverages %>% select (-starts_with ("X.ERROR")) # dropping X.ERROR column
Beverages_OK[, 1] <- NULL #dropping the first column
Beverages_OK <- cbind(data[1], Beverages_OK) # adding a date column
Beverages_zoo <- read.zoo(Beverages_OK, header = FALSE, format = "%Y-%m-%d")
save (Beverages_OK, file = "StatisticsInRFormat/Beverages.RData")
I tied to use 'lapply' function like this:
list <- ls() # the list of all the dataframes
lapply(list, function(X) {
temp <- X
temp <- temp [,colSums(is.na(temp))< nrow(temp)] #removing empty columns
temp <- temp %>% select (-starts_with ("X.ERROR")) # dropping X.ERROR column
temp[, 1] <- NULL
temp <- cbind(data[1], temp)
X_zoo <- read.zoo(X, header = FALSE, format = "%Y-%m-%d") # I don't know how to have the zame name as X has.
save (X, file = "StatisticsInRFormat/X.RData")
})
but it doesn't work. Is any way to do such a job? Is any r-package that facilitates it?
Thanks a lot.
If you are sure the you have only the needed data frames in the environment this should get you started:
df1 <- mtcars
df2 <- mtcars
df3 <- mtcars
list <- ls()
lapply(list, function(x) {
tmp <- get(x)
})