Merge dataframes adding date information from filename - r

I have access from scraped government data, which comes in a less the useful structure. Its 20 dfs that looks like:
Each df is named by the year, eg, X2006.csv is imported in a X2006 dataframe object. They all have a Total row and a total column, which I will deal with later. Now, my question is:
How do I merge these dfs, adding a column with the year information from the dataframe name?

Instead of using loop you can use do.call:
require(tidyverse)
#Creating sample data set - 10 data.frames
for (i in 1:9){
assign(paste("x200", i, sep=""),
data.frame(x = c(1:20)))
}
#Creating the big data.frame
MyBigDataframe <- do.call(rbind.data.frame, mget(ls(pattern = "x20"))) %>%
rownames_to_column("file")

startYear <- 1998
endYear <- 2017
myData <- data.frame()
require(plyr)
for (myYear in startYear:endYear){
df <- get(paste0("X", as.character(myYear)))
df$Year <- myYear
myData <- rbind.fill(myData, df)
}

Related

Saving Multiple Outputs from a for loop in r

I am trying to use a for loop to multiple create dataframes. The original code works fine for a single run (without the for loop).
object<- c(1,2,3)
for (i in 1:length(object)) {
df1<- SomeFunction1(object[i])
df2<- SomeFunction2(object[i])
df3<- SomeFunction3(object[i])
N.rows <- length(object)
combined <- vector("list", N.rows)
combined[i]<-list(rbind(df1,df2,df3))
When I do this I get combined[3] but not the outputs from the two other variables in my object. I have toyed around with it and managed to get as a result combined1, but again not a list with combined1, combined[2], and combined[3].
UPDATE: I was asked for the concrete example and expected output.
I'm basically taking three CSV files of county census data but organized differently (two with years as rows, and one with years as columns), transforming the data into a consistent format by county and then combining the files.
The above image is the result of View(combine). [[2]] is just what I want, but nothing is stored in [1].
This is the code that I used to get to it:
pop1990.2000 <- read.csv("1990-2000 Census Pop.csv",
stringsAsFactors = FALSE)
pop2000.2010 <- read.csv("2000-2010 Census Pop.csv",
stringsAsFactors = FALSE)
pop2010.2019 <- read.csv("2010-2019 Census Pop.csv",
stringsAsFactors = FALSE)
#Adding Total column "Population"
pop1990.2000$Population <- (rowSums(pop1990.2000) -
pop1990.2000$Year -
pop1990.2000$FIPS.Code)
#Combining State and County FIPS codes "FIPS.Code"
pop2000.2010$FIPS.Code <- (pop2000.2010$STATE*1000+
pop2000.2010$COUNTY)
pop2010.2019$FIPS.Code <- (pop2010.2019$STATE*1000+
pop2010.2019$COUNTY)
my_counties<-c(1125, 1127)
for (i in 1:length(my_counties)) {
#Selecting Pop data for County 1125 for 1990-2000
newdata <- pop1990.2000[ which(pop1990.2000$FIPS.Code==my_counties[i]), ]
newdata2000v1 <- as.data.frame(cbind(Year=newdata$Year,
Population=newdata$Pop))
#Adding FIPs Code
newdata2000v1$FIPS.Code<-my_counties[i]
#Merging County Name by FIPS.Code
pop2000.2010.c.fips <- pop2000.2010 %>%
select(FIPS.Code, CTYNAME)
pop2000.2010.c.fips$County<-pop2000.2010.c.fips$CTYNAME
newdata2000v1 <- newdata2000v1 %>%
mutate(FIPS.Code = as.numeric(FIPS.Code))
newdata2000 <- left_join(newdata2000v1,
pop2000.2010.c.fips,
by = "FIPS.Code")
newdata2000<-newdata2000 %>% select(County, FIPS.Code, Year, Population)
#Selecting Pop data for County 1125 for 2000-2010
newdata2 <- pop2000.2010[ which(pop2000.2010$FIPS.Code==my_counties[i]), ]
newdata2010 <- cbind("2000"=newdata2$ESTIMATESBASE2000,
"2001"=newdata2$POPESTIMATE2001,
"2002"=newdata2$POPESTIMATE2002,
"2003"=newdata2$POPESTIMATE2003,
"2004"=newdata2$POPESTIMATE2004,
"2005"=newdata2$POPESTIMATE2005,
"2006"=newdata2$POPESTIMATE2006,
"2007"=newdata2$POPESTIMATE2007,
"2008"=newdata2$POPESTIMATE2008,
"2009"=newdata2$POPESTIMATE2009)
newdata2010<-as.data.frame(t(newdata2010))
newdata2010$County<-newdata2$CTYNAME
newdata2010$FIPS.Code<-newdata2$FIPS.Code
newdata2010$Year<-c(rownames(newdata2010))
names(newdata2010)[names(newdata2010) == 'V1'] <- 'Population'
newdata2010<-newdata2010 %>% select(County, FIPS.Code, Year, Population)
#Selecting Pop data for County 1125 for 2010-2019
newdata3 <- pop2010.2019[ which(pop2010.2019$FIPS.Code==my_counties[i]), ]
newdata2019 <- cbind(Year=newdata3$Year,
"2010"=newdata3$CENSUS2010POP,
"2011"=newdata3$POPESTIMATE2011,
"2012"=newdata3$POPESTIMATE2012,
"2013"=newdata3$POPESTIMATE2013,
"2014"=newdata3$POPESTIMATE2014,
"2015"=newdata3$POPESTIMATE2015,
"2016"=newdata3$POPESTIMATE2016,
"2017"=newdata3$POPESTIMATE2017,
"2018"=newdata3$POPESTIMATE2018,
"2019"=newdata3$POPESTIMATE2019)
newdata2019<-as.data.frame(t(newdata2019))
newdata2019$County<-newdata3$CTYNAME
newdata2019$FIPS.Code<-newdata3$FIPS.Code
newdata2019$Year<-c(rownames(newdata2019))
names(newdata2019)[names(newdata2019) == 'V1'] <- 'Population'
newdata2019<-newdata2019 %>% select(County, FIPS.Code, Year, Population)
N.rows <- length(my_counties)
combined <- vector("list", N.rows)
combined[i]<-list(rbind(newdata2000,newdata2010,newdata2019))
The problem is that you are re-specifying the creation of the combined object.
I am not sure what exactly your var1 is, but, possibly the following should work:
object<- c(1,2,3)
N.rows <- length(var1)
combined <- vector("list", N.rows)
for (i in 1:length(object)) {
df1<- SomeFunction1(object[i])
df2<- SomeFunction2(object[i])
df3<- SomeFunction3(object[i])
combined[i]<-list(rbind(df1,df2,df3))
}
Alternatively, using lapply:
object<- c(1,2,3)
combined<-lapply(object, function(i){
df1<- SomeFunction1(object[i])
df2<- SomeFunction2(object[i])
df3<- SomeFunction3(object[i])
list(rbind(df1,df2,df3))
}
But this will deliver the list of length 3 (with three lists with df1,df2 and df3), not the length defined by the length of var1...

Multi step loop to acquire weather data over years and stations

I have a process to create a df for a single weather station over a singular month period. However, I have about 25 stations that I would like to aquire precipitation data for over a 5 year period.
I have the station ids in a df, that looks like the table below (but with 23 more stations.
stationid County
GHCND:USW00093721 ANNEARUNDEL
GHCND:USC00182308 BALTIMORE
The weather dataset is aquired through the following code
library("rnoaa")
ANNEARUNDEL_2006 <- ncdc(datasetid='GHCND', stationid = "GHCND:USC00182060", datatypeid='PRCP', startdate = '2006-07-01', enddate = '2006-08-01', limit=400, token = "API KEY")
ANNEARUNDEL_2006 <- ANNEARUNDEL_2006$data
I familiar with very basic for loops that work for one process. Is there a way to set this up the loop would create a new df using the county name and year over the span of 2006 to 2011 for all 25 stations? Is a loop the best way to accomplish this?
You could do something like this. Set up a function to read in the data, then loop through your df with mapply, and for each year with lapply. The output will be a named list of data (vectors as it stands, although you could capture more columns of df if you wanted, in which case they would be dataframes).
getNCDC <- function(id,County,year){
df <- ncdc(datasetid='GHCND', stationid = id, datatypeid='PRCP', startdate = paste0(year,'-07-01'), enddate = paste0(year,'-08-01'), limit=400, token = "API KEY")
df <- list(df$data)
names(df) <- paste(County,year,sep="_")
return(df)
}
allData <- lapply(2006:2011,function(year) mapply(getNCDC,df$stationid,df$County,year))
I like loops for things like this because they are easier to read and write. You could do it like this with two loops:
my_df <- read.table(text = "stationid County
GHCND:USW00093721 ANNEARUNDEL
GHCND:USC00182308 BALTIMORE",
header = T)
library(rnoaa)
results <- list() # list as storage variable for the loop results
i <- 1 # indexing variable
for(sid in unique(my_df$stationid)) { # each station in your stationid dataframe
for(year in 2006:2011) { # each year you care about
data <- ncdc(datasetid='GHCND', stationid = sid,
datatypeid='PRCP', startdate = paste0(year, '-01-01'),
enddate = paste0(year, '-12-31'), limit=400, token = "API KEY")$data # subset the returned list right away here with $data
# add info from each loop iteration
data$county <- my_df[my_df$stationid == sid,]$County
data$year <- year
results[[i]] <- data # store it
i <- i + 1 # rinse and repeat
}
}
one_big_df <- do.call(rbind, results) # stack all of the data frames together rowwise
Of course, you could always adjust a for loop to using lapply or it's friends. If speed became an issue you might want to consider it.
The following solution uses funcitons from the rnoaa and tidyverse package.
Notice that I used the ghcnd_search to download the precipitation data.
# Load packages
library(rnoaa)
library(tidyverse)
# Create example data frame
sample_df <- data.frame(stationid = c("USW00093721", "USC00182308"),
County = c("ANNEARUNDEL", "BALTIMORE"),
stringsAsFactors = FALSE)
# Download the data use map.
data_list <- map(sample_df$stationid, ghcnd_search,
date_min = "2006-01-01", date_max = "2011-12-31", var = "prcp")
Now the prcp data from each station are downloaded as a data frame. They are all stroed in the data_list as a list.
You can access the data of each station by accessing the list, or you can convert the data in the list to a single data frame. Here is an example:
# Transpost the data_list. Turns a list-of-lists "inside-out"
data_list2 <- transpose(data_list)
# Combine all data to a single data frame
data_df <- bind_rows(data_list2$prcp)
Now all the data are in data_df as a data frame

How to read and use the dataframes with the different names in a loop?

I'm struggling with the following issue: I have many data frames with different names (For instance, Beverage, Construction, Electronic etc., dim. 540x1000). I need to clean each of them, calculate and save as zoo object and R data file. Cleaning is the same for all of them - deleting the empty columns and the columns with some specific names.
For example:
Beverages <- Beverages[,colSums(is.na(Beverages))<nrow(Beverages)] #removing empty columns
Beverages_OK <- Beverages %>% select (-starts_with ("X.ERROR")) # dropping X.ERROR column
Beverages_OK[, 1] <- NULL #dropping the first column
Beverages_OK <- cbind(data[1], Beverages_OK) # adding a date column
Beverages_zoo <- read.zoo(Beverages_OK, header = FALSE, format = "%Y-%m-%d")
save (Beverages_OK, file = "StatisticsInRFormat/Beverages.RData")
I tied to use 'lapply' function like this:
list <- ls() # the list of all the dataframes
lapply(list, function(X) {
temp <- X
temp <- temp [,colSums(is.na(temp))< nrow(temp)] #removing empty columns
temp <- temp %>% select (-starts_with ("X.ERROR")) # dropping X.ERROR column
temp[, 1] <- NULL
temp <- cbind(data[1], temp)
X_zoo <- read.zoo(X, header = FALSE, format = "%Y-%m-%d") # I don't know how to have the zame name as X has.
save (X, file = "StatisticsInRFormat/X.RData")
})
but it doesn't work. Is any way to do such a job? Is any r-package that facilitates it?
Thanks a lot.
If you are sure the you have only the needed data frames in the environment this should get you started:
df1 <- mtcars
df2 <- mtcars
df3 <- mtcars
list <- ls()
lapply(list, function(x) {
tmp <- get(x)
})

Call a function for the number of items in a data frame and merge the results together

I have a function called getWeatherForMonth that takes a start date and end date and returns as data frame of the result for each month. I have another method getWeatherForRange that takes a data frame of ranges. I need to call getWeatherForMonth for each row in the "dates" and combine the results into one data frame. I was using mapply like below but it's not combining the resulting data frames.
library(RJSONIO)
getWeatherForMonth <- function(start.date, end.date) {
url <- "http://api.worldweatheronline.com/premium/v1/past-weather.ashx?key=PUT-YOUR-KEY-HERE&q=London&format=json&date=%s&enddate=%e&tp=24"
url <- gsub("%s", start.date, url)
url <- url <- gsub("%e", end.date, url)
data <- fromJSON(url)
weather <- data$data$weather
GMT <- sapply(weather, function(x){as.character(x[1])})
Max.TemperatureC <- sapply(weather, function(x){as.numeric(x[3])})
Min.TemperatureC <- sapply(weather, function(x){as.numeric(x[4])})
Wind.SpeedKm.h <- sapply(weather, function(x){as.numeric(x$hourly[[1]]$windspeedKmph[1])})
Precipitationmm <- sapply(weather, function(x){as.numeric(x$hourly[[1]]$precipMM[1])})
DewPointC <-sapply(weather, function(x){as.numeric(x$hourly[[1]]$DewPointC[1])})
Wind.Chill <-sapply(weather, function(x){as.numeric(x$hourly[[1]]$WindChillC[1])})
Cloud.Cover <-sapply(weather, function(x){as.numeric(x$hourly[[1]]$cloudcover[1])})
Description <-sapply(weather, function(x){as.character(x$hourly[[1]]$weatherDesc[1])})
Humidity <- sapply(weather, function(x){as.numeric(x$hourly[[1]]$humidity[1])})
Feels.LikeC <- sapply(weather, function(x){as.numeric(x$hourly[[1]]$FeelsLikeC[1])})
df <- data.frame(GMT, Max.TemperatureC, Min.TemperatureC, Wind.SpeedKm.h, Precipitationmm, DewPointC, Wind.Chill, Cloud.Cover, Description, Humidity, Feels.LikeC)
return(df)
}
getWeatherForRange <- function(dates) {
df <- mapply(getWeatherForMonth, dates$start.date, dates$end.date)
return(df)
}
start.date <- seq(as.Date("2015-01-01"), length=12, by="1 month")
end.date <- seq(as.Date("2015-02-01"),length=12,by="months") - 1
dates.2015 <- data.frame(start.date, end.date)
data <- getWeatherForRange(dates)
View(data)
The output looks like this
Screenshot of the current output
Consider using Map(). Specifically, in your getWeatherForRange function, use Map() which is actually a wrapper for the non-simplified version of mapply(), equivalent to mapply(..., SIMPLIFY=FALSE). By default, mapply() returns a vector, matrix, or higher dimensional array. But you require a dataframe (i.e., a list object) return.
This updated function will return a list of dataframes that you can then later run a do.call(rbind, ...), assuming all columns are consistent in each df, to stack all dfs together for a final dataframe.
getWeatherForRange <- function(dates) {
# EQUIVALENT LINES
dfList <- Map(getWeatherForMonth, dates$start.date, dates$end.date)
# dfList <- mapply(getWeatherForMonth, dates$start.date, dates$end.date, SIMPLIFY = FALSE)
return(dfList)
}
start.date <- seq(as.Date("2015-01-01"), length=12, by="1 month")
end.date <- seq(as.Date("2015-02-01"), length=12, by="months") - 1
dates <- data.frame(start.date, end.date)
datalist <- getWeatherForRange(dates) # DATAFRAME LIST
data <- do.call(rbind, datalist) # FINAL DATA FRAME

Loop over a subset, source a file and save results in a dataframe

Similar questions have been asked already but none was able to solve my specific problem. I have a .R file ("Mycalculus.R") containing many basic calculus that I need to apply to subsets of a dataframe: one subset for each year where the modalities of "year" are factors (yearA, yearB, yearC) not numeric values. The file generates a new dataframe that I need to save in a Rda file. Here is what I expect the code to look like with a for loop (this one obviously do not work):
id <- identif(unlist(df$year))
for (i in 1:length(id)){
data <- subset(df, year == id[i])
source ("Mycalculus.R", echo=TRUE)
save(content_df1,file="myresults.Rda")
}
Here is an exact of the main data.frame df:
obs year income gender ageclass weight
1 yearA 1000 F 1 10
2 yearA 1200 M 2 25
3 yearB 1400 M 2 5
4 yearB 1350 M 1 11
Here is what the sourced file "Mycalculus.R" do: it applies numerous basic calculus to columns of the dataframe called "data", and creates two new dataframes df1 and then df2 based on df1. Here is an extract:
data <- data %>%
group_by(gender) %>%
mutate(Income_gender = weighted.mean(income, weight))
data <- data %>%
group_by(ageclass) %>%
mutate(Income_ageclass = weighted.mean(income, weight))
library(GiniWegNeg)
gini=c(Gini_RSV(data$Income_gender, weight), Gini_RSV(data$Income_ageclass,weight))
df1=data.frame(gini)
colnames(df1) <- c("Income_gender","Income_ageclass")
rownames(df1) <- c("content_df1")
df2=(1/5)*df1$Income_gender+df2$Income_ageclass
colnames(df2) <- c("myresult")
rownames(df2) <- c("content_df2")
So that in the end, I get two dataframes like this:
Income_Gender Income_Ageclass
content_df1 .... ....
And for df2:
myresult
content_df2 ....
But I need to save df1 and Rf2 as a Rda file where the row names of content_df1 and content_df2 are given per subset, something like this:
Income_Gender Income_Ageclass
content_df1_yearA .... ....
content_df1_yearB .... ....
content_df1_yearC .... ....
and
myresult
content_df2_yearA ....
content_df2_yearB ....
content_df2_yearC ....
Currently, my program does not use any loop and is doing the job but messily. Basically the code is more than 2500 lines of code. (please don't throw tomatoes at me).
Anyone could help me with this specific request?
Thank you in advance.
Consider incorporating all in one script with a defined function of needed arguments, called by lapply(). Lapply then returns a list of dataframes that you can rowbind into one final df.
library(dplyr)
library(GiniWegNeg)
runIncomeCalc <- function(data, y){
data <- data %>%
group_by(gender) %>%
mutate(Income_gender = weighted.mean(income, weight))
data <- data %>%
group_by(ageclass) %>%
mutate(Income_ageclass = weighted.mean(income, weight))
gini <- c(Gini_RSV(data$Income_gender, weight), Gini_RSV(data$Income_ageclass,weight))
df1 <- data.frame(gini)
colnames(df1) <- c("Income_gender","Income_ageclass")
rownames(df1) <- c(paste0("content_df1_", y))
return(df1)
}
runResultsCalc <- function(df, y){
df2 <- (1/5) * df$Income_gender + df$Income_ageclass
colnames(df2) <- c("myresult")
rownames(df2) <- c(paste0("content_df2_", y)
return(df2)
}
dfIncList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
runIncomeCalc(yeardata, i)
})
dfResList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
df <- runIncomeCalc(yeardata, i)
runResultsCalc(df, i)
})
df1 <- do.call(rbind, dfIncList)
df2 <- do.call(rbind, dfResList)
Now if you need to source across scripts. Create same two functions, runIncomeCalc and runResultsCalc in Mycalculus.R and then call each in other script:
library(dplyr)
library(GiniWegNeg)
if(!exists("runIncomeCalc", mode="function")) source("Mycalculus.R")
dfIncList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
runIncomeCalc(yeardata, i)
})
dfResList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
df <- runIncomeCalc(yeardata, i)
runResultsCalc(df, i)
})
df1 <- do.call(rbind, dfIncList)
df2 <- do.call(rbind, dfResList)
If you functional-ize your steps you can create a workflow like the following:
calcFunc <- function(df) {
## Do something to the df, then return it
df
}
processFunc <- function(fname) {
## Read in your table
x <- read.table(fname)
## Do the calculation
x <- calcFunc(x)
## Make a new file name (remember to change the file extension)
new_fname <- sub("something", "else", fname)
## Write the .RData file
save(x, file = new_fname)
}
### Your workflow
## Generate a vector of files
my_files <- list.files()
## Do the work
res <- lapply(my_files, processFunc)
Alternatively, don't save the files. Omit the save call in the processFunc, and return a list of data.frame objects. Then use either data.table::rbindlist(res) or do.call(rbind, list) to make one large data.frame object.

Resources