I am trying to use a for loop to multiple create dataframes. The original code works fine for a single run (without the for loop).
object<- c(1,2,3)
for (i in 1:length(object)) {
df1<- SomeFunction1(object[i])
df2<- SomeFunction2(object[i])
df3<- SomeFunction3(object[i])
N.rows <- length(object)
combined <- vector("list", N.rows)
combined[i]<-list(rbind(df1,df2,df3))
When I do this I get combined[3] but not the outputs from the two other variables in my object. I have toyed around with it and managed to get as a result combined1, but again not a list with combined1, combined[2], and combined[3].
UPDATE: I was asked for the concrete example and expected output.
I'm basically taking three CSV files of county census data but organized differently (two with years as rows, and one with years as columns), transforming the data into a consistent format by county and then combining the files.
The above image is the result of View(combine). [[2]] is just what I want, but nothing is stored in [1].
This is the code that I used to get to it:
pop1990.2000 <- read.csv("1990-2000 Census Pop.csv",
stringsAsFactors = FALSE)
pop2000.2010 <- read.csv("2000-2010 Census Pop.csv",
stringsAsFactors = FALSE)
pop2010.2019 <- read.csv("2010-2019 Census Pop.csv",
stringsAsFactors = FALSE)
#Adding Total column "Population"
pop1990.2000$Population <- (rowSums(pop1990.2000) -
pop1990.2000$Year -
pop1990.2000$FIPS.Code)
#Combining State and County FIPS codes "FIPS.Code"
pop2000.2010$FIPS.Code <- (pop2000.2010$STATE*1000+
pop2000.2010$COUNTY)
pop2010.2019$FIPS.Code <- (pop2010.2019$STATE*1000+
pop2010.2019$COUNTY)
my_counties<-c(1125, 1127)
for (i in 1:length(my_counties)) {
#Selecting Pop data for County 1125 for 1990-2000
newdata <- pop1990.2000[ which(pop1990.2000$FIPS.Code==my_counties[i]), ]
newdata2000v1 <- as.data.frame(cbind(Year=newdata$Year,
Population=newdata$Pop))
#Adding FIPs Code
newdata2000v1$FIPS.Code<-my_counties[i]
#Merging County Name by FIPS.Code
pop2000.2010.c.fips <- pop2000.2010 %>%
select(FIPS.Code, CTYNAME)
pop2000.2010.c.fips$County<-pop2000.2010.c.fips$CTYNAME
newdata2000v1 <- newdata2000v1 %>%
mutate(FIPS.Code = as.numeric(FIPS.Code))
newdata2000 <- left_join(newdata2000v1,
pop2000.2010.c.fips,
by = "FIPS.Code")
newdata2000<-newdata2000 %>% select(County, FIPS.Code, Year, Population)
#Selecting Pop data for County 1125 for 2000-2010
newdata2 <- pop2000.2010[ which(pop2000.2010$FIPS.Code==my_counties[i]), ]
newdata2010 <- cbind("2000"=newdata2$ESTIMATESBASE2000,
"2001"=newdata2$POPESTIMATE2001,
"2002"=newdata2$POPESTIMATE2002,
"2003"=newdata2$POPESTIMATE2003,
"2004"=newdata2$POPESTIMATE2004,
"2005"=newdata2$POPESTIMATE2005,
"2006"=newdata2$POPESTIMATE2006,
"2007"=newdata2$POPESTIMATE2007,
"2008"=newdata2$POPESTIMATE2008,
"2009"=newdata2$POPESTIMATE2009)
newdata2010<-as.data.frame(t(newdata2010))
newdata2010$County<-newdata2$CTYNAME
newdata2010$FIPS.Code<-newdata2$FIPS.Code
newdata2010$Year<-c(rownames(newdata2010))
names(newdata2010)[names(newdata2010) == 'V1'] <- 'Population'
newdata2010<-newdata2010 %>% select(County, FIPS.Code, Year, Population)
#Selecting Pop data for County 1125 for 2010-2019
newdata3 <- pop2010.2019[ which(pop2010.2019$FIPS.Code==my_counties[i]), ]
newdata2019 <- cbind(Year=newdata3$Year,
"2010"=newdata3$CENSUS2010POP,
"2011"=newdata3$POPESTIMATE2011,
"2012"=newdata3$POPESTIMATE2012,
"2013"=newdata3$POPESTIMATE2013,
"2014"=newdata3$POPESTIMATE2014,
"2015"=newdata3$POPESTIMATE2015,
"2016"=newdata3$POPESTIMATE2016,
"2017"=newdata3$POPESTIMATE2017,
"2018"=newdata3$POPESTIMATE2018,
"2019"=newdata3$POPESTIMATE2019)
newdata2019<-as.data.frame(t(newdata2019))
newdata2019$County<-newdata3$CTYNAME
newdata2019$FIPS.Code<-newdata3$FIPS.Code
newdata2019$Year<-c(rownames(newdata2019))
names(newdata2019)[names(newdata2019) == 'V1'] <- 'Population'
newdata2019<-newdata2019 %>% select(County, FIPS.Code, Year, Population)
N.rows <- length(my_counties)
combined <- vector("list", N.rows)
combined[i]<-list(rbind(newdata2000,newdata2010,newdata2019))
The problem is that you are re-specifying the creation of the combined object.
I am not sure what exactly your var1 is, but, possibly the following should work:
object<- c(1,2,3)
N.rows <- length(var1)
combined <- vector("list", N.rows)
for (i in 1:length(object)) {
df1<- SomeFunction1(object[i])
df2<- SomeFunction2(object[i])
df3<- SomeFunction3(object[i])
combined[i]<-list(rbind(df1,df2,df3))
}
Alternatively, using lapply:
object<- c(1,2,3)
combined<-lapply(object, function(i){
df1<- SomeFunction1(object[i])
df2<- SomeFunction2(object[i])
df3<- SomeFunction3(object[i])
list(rbind(df1,df2,df3))
}
But this will deliver the list of length 3 (with three lists with df1,df2 and df3), not the length defined by the length of var1...
Related
I have access from scraped government data, which comes in a less the useful structure. Its 20 dfs that looks like:
Each df is named by the year, eg, X2006.csv is imported in a X2006 dataframe object. They all have a Total row and a total column, which I will deal with later. Now, my question is:
How do I merge these dfs, adding a column with the year information from the dataframe name?
Instead of using loop you can use do.call:
require(tidyverse)
#Creating sample data set - 10 data.frames
for (i in 1:9){
assign(paste("x200", i, sep=""),
data.frame(x = c(1:20)))
}
#Creating the big data.frame
MyBigDataframe <- do.call(rbind.data.frame, mget(ls(pattern = "x20"))) %>%
rownames_to_column("file")
startYear <- 1998
endYear <- 2017
myData <- data.frame()
require(plyr)
for (myYear in startYear:endYear){
df <- get(paste0("X", as.character(myYear)))
df$Year <- myYear
myData <- rbind.fill(myData, df)
}
I have a process to create a df for a single weather station over a singular month period. However, I have about 25 stations that I would like to aquire precipitation data for over a 5 year period.
I have the station ids in a df, that looks like the table below (but with 23 more stations.
stationid County
GHCND:USW00093721 ANNEARUNDEL
GHCND:USC00182308 BALTIMORE
The weather dataset is aquired through the following code
library("rnoaa")
ANNEARUNDEL_2006 <- ncdc(datasetid='GHCND', stationid = "GHCND:USC00182060", datatypeid='PRCP', startdate = '2006-07-01', enddate = '2006-08-01', limit=400, token = "API KEY")
ANNEARUNDEL_2006 <- ANNEARUNDEL_2006$data
I familiar with very basic for loops that work for one process. Is there a way to set this up the loop would create a new df using the county name and year over the span of 2006 to 2011 for all 25 stations? Is a loop the best way to accomplish this?
You could do something like this. Set up a function to read in the data, then loop through your df with mapply, and for each year with lapply. The output will be a named list of data (vectors as it stands, although you could capture more columns of df if you wanted, in which case they would be dataframes).
getNCDC <- function(id,County,year){
df <- ncdc(datasetid='GHCND', stationid = id, datatypeid='PRCP', startdate = paste0(year,'-07-01'), enddate = paste0(year,'-08-01'), limit=400, token = "API KEY")
df <- list(df$data)
names(df) <- paste(County,year,sep="_")
return(df)
}
allData <- lapply(2006:2011,function(year) mapply(getNCDC,df$stationid,df$County,year))
I like loops for things like this because they are easier to read and write. You could do it like this with two loops:
my_df <- read.table(text = "stationid County
GHCND:USW00093721 ANNEARUNDEL
GHCND:USC00182308 BALTIMORE",
header = T)
library(rnoaa)
results <- list() # list as storage variable for the loop results
i <- 1 # indexing variable
for(sid in unique(my_df$stationid)) { # each station in your stationid dataframe
for(year in 2006:2011) { # each year you care about
data <- ncdc(datasetid='GHCND', stationid = sid,
datatypeid='PRCP', startdate = paste0(year, '-01-01'),
enddate = paste0(year, '-12-31'), limit=400, token = "API KEY")$data # subset the returned list right away here with $data
# add info from each loop iteration
data$county <- my_df[my_df$stationid == sid,]$County
data$year <- year
results[[i]] <- data # store it
i <- i + 1 # rinse and repeat
}
}
one_big_df <- do.call(rbind, results) # stack all of the data frames together rowwise
Of course, you could always adjust a for loop to using lapply or it's friends. If speed became an issue you might want to consider it.
The following solution uses funcitons from the rnoaa and tidyverse package.
Notice that I used the ghcnd_search to download the precipitation data.
# Load packages
library(rnoaa)
library(tidyverse)
# Create example data frame
sample_df <- data.frame(stationid = c("USW00093721", "USC00182308"),
County = c("ANNEARUNDEL", "BALTIMORE"),
stringsAsFactors = FALSE)
# Download the data use map.
data_list <- map(sample_df$stationid, ghcnd_search,
date_min = "2006-01-01", date_max = "2011-12-31", var = "prcp")
Now the prcp data from each station are downloaded as a data frame. They are all stroed in the data_list as a list.
You can access the data of each station by accessing the list, or you can convert the data in the list to a single data frame. Here is an example:
# Transpost the data_list. Turns a list-of-lists "inside-out"
data_list2 <- transpose(data_list)
# Combine all data to a single data frame
data_df <- bind_rows(data_list2$prcp)
Now all the data are in data_df as a data frame
I'm struggling with the following issue: I have many data frames with different names (For instance, Beverage, Construction, Electronic etc., dim. 540x1000). I need to clean each of them, calculate and save as zoo object and R data file. Cleaning is the same for all of them - deleting the empty columns and the columns with some specific names.
For example:
Beverages <- Beverages[,colSums(is.na(Beverages))<nrow(Beverages)] #removing empty columns
Beverages_OK <- Beverages %>% select (-starts_with ("X.ERROR")) # dropping X.ERROR column
Beverages_OK[, 1] <- NULL #dropping the first column
Beverages_OK <- cbind(data[1], Beverages_OK) # adding a date column
Beverages_zoo <- read.zoo(Beverages_OK, header = FALSE, format = "%Y-%m-%d")
save (Beverages_OK, file = "StatisticsInRFormat/Beverages.RData")
I tied to use 'lapply' function like this:
list <- ls() # the list of all the dataframes
lapply(list, function(X) {
temp <- X
temp <- temp [,colSums(is.na(temp))< nrow(temp)] #removing empty columns
temp <- temp %>% select (-starts_with ("X.ERROR")) # dropping X.ERROR column
temp[, 1] <- NULL
temp <- cbind(data[1], temp)
X_zoo <- read.zoo(X, header = FALSE, format = "%Y-%m-%d") # I don't know how to have the zame name as X has.
save (X, file = "StatisticsInRFormat/X.RData")
})
but it doesn't work. Is any way to do such a job? Is any r-package that facilitates it?
Thanks a lot.
If you are sure the you have only the needed data frames in the environment this should get you started:
df1 <- mtcars
df2 <- mtcars
df3 <- mtcars
list <- ls()
lapply(list, function(x) {
tmp <- get(x)
})
Similar questions have been asked already but none was able to solve my specific problem. I have a .R file ("Mycalculus.R") containing many basic calculus that I need to apply to subsets of a dataframe: one subset for each year where the modalities of "year" are factors (yearA, yearB, yearC) not numeric values. The file generates a new dataframe that I need to save in a Rda file. Here is what I expect the code to look like with a for loop (this one obviously do not work):
id <- identif(unlist(df$year))
for (i in 1:length(id)){
data <- subset(df, year == id[i])
source ("Mycalculus.R", echo=TRUE)
save(content_df1,file="myresults.Rda")
}
Here is an exact of the main data.frame df:
obs year income gender ageclass weight
1 yearA 1000 F 1 10
2 yearA 1200 M 2 25
3 yearB 1400 M 2 5
4 yearB 1350 M 1 11
Here is what the sourced file "Mycalculus.R" do: it applies numerous basic calculus to columns of the dataframe called "data", and creates two new dataframes df1 and then df2 based on df1. Here is an extract:
data <- data %>%
group_by(gender) %>%
mutate(Income_gender = weighted.mean(income, weight))
data <- data %>%
group_by(ageclass) %>%
mutate(Income_ageclass = weighted.mean(income, weight))
library(GiniWegNeg)
gini=c(Gini_RSV(data$Income_gender, weight), Gini_RSV(data$Income_ageclass,weight))
df1=data.frame(gini)
colnames(df1) <- c("Income_gender","Income_ageclass")
rownames(df1) <- c("content_df1")
df2=(1/5)*df1$Income_gender+df2$Income_ageclass
colnames(df2) <- c("myresult")
rownames(df2) <- c("content_df2")
So that in the end, I get two dataframes like this:
Income_Gender Income_Ageclass
content_df1 .... ....
And for df2:
myresult
content_df2 ....
But I need to save df1 and Rf2 as a Rda file where the row names of content_df1 and content_df2 are given per subset, something like this:
Income_Gender Income_Ageclass
content_df1_yearA .... ....
content_df1_yearB .... ....
content_df1_yearC .... ....
and
myresult
content_df2_yearA ....
content_df2_yearB ....
content_df2_yearC ....
Currently, my program does not use any loop and is doing the job but messily. Basically the code is more than 2500 lines of code. (please don't throw tomatoes at me).
Anyone could help me with this specific request?
Thank you in advance.
Consider incorporating all in one script with a defined function of needed arguments, called by lapply(). Lapply then returns a list of dataframes that you can rowbind into one final df.
library(dplyr)
library(GiniWegNeg)
runIncomeCalc <- function(data, y){
data <- data %>%
group_by(gender) %>%
mutate(Income_gender = weighted.mean(income, weight))
data <- data %>%
group_by(ageclass) %>%
mutate(Income_ageclass = weighted.mean(income, weight))
gini <- c(Gini_RSV(data$Income_gender, weight), Gini_RSV(data$Income_ageclass,weight))
df1 <- data.frame(gini)
colnames(df1) <- c("Income_gender","Income_ageclass")
rownames(df1) <- c(paste0("content_df1_", y))
return(df1)
}
runResultsCalc <- function(df, y){
df2 <- (1/5) * df$Income_gender + df$Income_ageclass
colnames(df2) <- c("myresult")
rownames(df2) <- c(paste0("content_df2_", y)
return(df2)
}
dfIncList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
runIncomeCalc(yeardata, i)
})
dfResList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
df <- runIncomeCalc(yeardata, i)
runResultsCalc(df, i)
})
df1 <- do.call(rbind, dfIncList)
df2 <- do.call(rbind, dfResList)
Now if you need to source across scripts. Create same two functions, runIncomeCalc and runResultsCalc in Mycalculus.R and then call each in other script:
library(dplyr)
library(GiniWegNeg)
if(!exists("runIncomeCalc", mode="function")) source("Mycalculus.R")
dfIncList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
runIncomeCalc(yeardata, i)
})
dfResList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
df <- runIncomeCalc(yeardata, i)
runResultsCalc(df, i)
})
df1 <- do.call(rbind, dfIncList)
df2 <- do.call(rbind, dfResList)
If you functional-ize your steps you can create a workflow like the following:
calcFunc <- function(df) {
## Do something to the df, then return it
df
}
processFunc <- function(fname) {
## Read in your table
x <- read.table(fname)
## Do the calculation
x <- calcFunc(x)
## Make a new file name (remember to change the file extension)
new_fname <- sub("something", "else", fname)
## Write the .RData file
save(x, file = new_fname)
}
### Your workflow
## Generate a vector of files
my_files <- list.files()
## Do the work
res <- lapply(my_files, processFunc)
Alternatively, don't save the files. Omit the save call in the processFunc, and return a list of data.frame objects. Then use either data.table::rbindlist(res) or do.call(rbind, list) to make one large data.frame object.
I have a function that takes two arguments, a link and and an State name abbreviation. The function gets a spreadsheet from a remote site and
makes me a data frame of the state data that I want.
library(lubridate)
library(tidyr)
library(gdata)
library(dplyr)
options(stringsAsFactors = FALSE)
nsw_link <- 'http://www.abs.gov.au/ausstats/abs#archive.nsf/log?openagent&3101051.xls&3101.0&Time%20Series%20Spreadsheet&3F92BFA30BC29940CA257F1D001427C3&0&Jun%202015&17.12.2015&Latest'
qld_link <- "http://www.abs.gov.au/ausstats/abs#archive.nsf/log?openagent&3101053-.xls&3101.0&Time%20Series%20Spreadsheet&2927EBD7E6856BABCA257F1D0014283D&0&Jun%202015&17.12.2015&Latest"
vic_link <- "http://www.abs.gov.au/ausstats/abs#archive.nsf/log?openagent&3101052.xls&3101.0&Time%20Series%20Spreadsheet&E3B2958632AB29ECCA257F1D001427FB&0&Jun%202015&17.12.2015&Latest"
nt_link <- "http://www.abs.gov.au/ausstats/abs#archive.nsf/log?openagent&3101057.xls&3101.0&Time%20Series%20Spreadsheet&CCB60AB638D60938CA257F1D0014291C&0&Jun%202015&17.12.2015&Latest"
tas_link <- "http://www.abs.gov.au/ausstats/abs#archive.nsf/log?openagent&3101056.xls&3101.0&Time%20Series%20Spreadsheet&8CA5625A306A4805CA257F1D001428D7&0&Jun%202015&17.12.2015&Latest"
act_link <- "http://www.abs.gov.au/ausstats/abs#archive.nsf/log?openagent&3101058.xls&3101.0&Time%20Series%20Spreadsheet&4234206BA89A82F6CA257F1D00142959&0&Jun%202015&17.12.2015&Latest"
sa_link <- "http://www.abs.gov.au/ausstats/abs#archive.nsf/log?openagent&3101054.xls&3101.0&Time%20Series%20Spreadsheet&A342BFFB06A62F4FCA257F1D0014286D&0&Jun%202015&17.12.2015&Latest"
wa_link <- "http://www.abs.gov.au/ausstats/abs#archive.nsf/log?openagent&3101055.xls&3101.0&Time%20Series%20Spreadsheet&DC6208699BE4D2FFCA257F1D0014289E&0&Jun%202015&17.12.2015&Latest"
get_ERP_data <- function(link, state){
# Get the xls file and slice only the columns needed for male and female ERPs
xls_data <- read.xls(link, sheet = 'Data1')
xls_data <- tbl_df(xls_data)
xls_data <- xls_data[, 1:203]
names(xls_data) <- gsub(pattern = "Estimated.Resident.Population....", '', names(xls_data))
names(xls_data) <- gsub('[.]+', '', names(xls_data))
names(xls_data) <- gsub('100andover', '101', names(xls_data))
names(xls_data) <- gsub("(\\d+)$", ".\\1", names(xls_data))
xls_data <- xls_data[28:54, ]
names(xls_data)[1] <- 'Year'
xls_data$Year <- paste('01', xls_data$Year, sep = '-')
xls_data$Year <- dmy(xls_data$Year)
xls_data$Year <- year(xls_data$Year)
# make a long version of the ERP data
xls_data_long <- xls_data %>% gather(Sex_Age, Population, Male.0:Female.101)
# Make two new column, sex and age, from the sex_age column
xls_data_sep_log <- xls_data_long %>% separate(Sex_Age, c('Sex', 'Age'))
# Recode observations from 101 to 'over 100'
xls_data_sep_log$Age[grep(xls_data_sep_log$Age, pattern = '101')] <- 'Over 100'
xls_data_sep_log$State <- state
df_name <- paste(state, 'data', sep = '_')
assign(df_name,xls_data_sep_log,envir = .GlobalEnv)
}
link_list <- list(act_link, nsw_link, nt_link, qld_link, sa_link, tas_link, vic_link, wa_link)
states <- c('ACT', 'NSW', 'NT', 'QLD', 'SA', 'TAS', 'VIC', 'WA')
For example, if I want Queensland estimated resident population data I run:
get_ERP_data(qld_link, 'QLD')
This makes me a longitudinal data frame of the data I need.
I would like to extend this so that I can pass a vector or list of states to a list of links as parameter and return a data frame for each
argument in the paramter? I am not particularly concerned with binding resulting data frames in the function output.
Similar to the example above, how could I get my function to run something like:
get_ERP_data(as.list(qld_link, nsw_link), c('QLD', 'NSW'))
I tried using variations of sapply and do.call but did not get very far. Is there a way to use some equivalent of pythons *args/**kwargs
to pass lists as arguments to my function?
Any help would be appreciated.