Plotting multiple graphs from a list - r

I have a time series data from 1990 to 1994 of 5 variables in 15 sheets. I read all these data to a list. I need to do a time series plot of all the 5 Variables for 15 companies in multiple graphs. How can this be done? I mean I basically need 5 figures each containing the time series plot of 15 companies of the respective variables.

With package ggplot2 this can be done as follows. I assume you have a list of 15 dataframes named df_list.
First, rbind them together with the name of the company as a new column. The companies' names are in this fake data case stored as the df's names.
all_df <- lapply(names(df_list), function(x){
DF <- df_list[[x]]
DF$Company <- x
DF
})
all_df <- do.call(rbind, all_df)
Then, reshape from wide to long format.
long_df <- reshape2::melt(all_df, id.vars = c("Company", "Date"))
Now, plot them. The graphs can be customized at will, there are many posts on it.
library(ggplot2)
ggplot(long_df, aes(x = Date, y = value, colour = Company)) +
geom_line() +
facet_wrap(~ variable)
Data creation code.
set.seed(1234)
Dates <- seq(as.Date("1990-01-01"), as.Date("1994-12-31"), by = "month")
n <- length(Dates)
df_list <- lapply(1:15, function(i){
tmp <- matrix(rnorm(5*n), ncol = 5)
tmp <- apply(tmp, 2, cumsum)
colnames(tmp) <- paste0("Var", 1:5)
tmp <- as.data.frame(tmp)
tmp$Date <- Dates
tmp
})
names(df_list) <- paste("Company", seq_along(df_list), sep = ".")

Related

Looping through values to make changes in data frame

I have a code that makes some changes in a dataframe.
value <- iris[1:120,]
cngfunc <- function(day,howmany,howmuch){
shuffled= day[sample(1:nrow(day)), ]
n = as.integer((howmany/100)*nrow(day)) #select percentage of data to be changed
extracted <- shuffled[1:n, ]
extracted$changed <- extracted[,1]*((howmuch/100)+1) #how much the data changes
extracted}
cngfunc(value,10,20)
Now I want to loop through the values of howmany and howmuch.
For example, howmuch <- c(10,20,30,40,50) and howmany <- c(10,20,30,40,50)
So the first result would be for cngfunc(value,10,10), cngfunc(value,10,20),cngfunc(value,10,30)....and cngfunc(value,20,10), cngfunc(value,20,20), and so on such that I'll have 25 different data frame.
Is there a way to do that?
You can do it with expand.grid to get all of the combinations, and the a map2 to create a list of dataframes:
library(tidyverse)
combos <- expand.grid(c(10,20,30,40,50), c(10,20,30,40,50))
result <- map2(combos$Var1, combos$Var2, function(x, y) cngfunc(value, x, y)) %>%
setNames(tidyr::unite(combos, Var, Var1:Var2, sep = "-")$Var)
Not sure where you are getting 120 dataframes from, as 5 * 5 = 25. This should be the general idea though.

Saving Multiple Outputs from a for loop in r

I am trying to use a for loop to multiple create dataframes. The original code works fine for a single run (without the for loop).
object<- c(1,2,3)
for (i in 1:length(object)) {
df1<- SomeFunction1(object[i])
df2<- SomeFunction2(object[i])
df3<- SomeFunction3(object[i])
N.rows <- length(object)
combined <- vector("list", N.rows)
combined[i]<-list(rbind(df1,df2,df3))
When I do this I get combined[3] but not the outputs from the two other variables in my object. I have toyed around with it and managed to get as a result combined1, but again not a list with combined1, combined[2], and combined[3].
UPDATE: I was asked for the concrete example and expected output.
I'm basically taking three CSV files of county census data but organized differently (two with years as rows, and one with years as columns), transforming the data into a consistent format by county and then combining the files.
The above image is the result of View(combine). [[2]] is just what I want, but nothing is stored in [1].
This is the code that I used to get to it:
pop1990.2000 <- read.csv("1990-2000 Census Pop.csv",
stringsAsFactors = FALSE)
pop2000.2010 <- read.csv("2000-2010 Census Pop.csv",
stringsAsFactors = FALSE)
pop2010.2019 <- read.csv("2010-2019 Census Pop.csv",
stringsAsFactors = FALSE)
#Adding Total column "Population"
pop1990.2000$Population <- (rowSums(pop1990.2000) -
pop1990.2000$Year -
pop1990.2000$FIPS.Code)
#Combining State and County FIPS codes "FIPS.Code"
pop2000.2010$FIPS.Code <- (pop2000.2010$STATE*1000+
pop2000.2010$COUNTY)
pop2010.2019$FIPS.Code <- (pop2010.2019$STATE*1000+
pop2010.2019$COUNTY)
my_counties<-c(1125, 1127)
for (i in 1:length(my_counties)) {
#Selecting Pop data for County 1125 for 1990-2000
newdata <- pop1990.2000[ which(pop1990.2000$FIPS.Code==my_counties[i]), ]
newdata2000v1 <- as.data.frame(cbind(Year=newdata$Year,
Population=newdata$Pop))
#Adding FIPs Code
newdata2000v1$FIPS.Code<-my_counties[i]
#Merging County Name by FIPS.Code
pop2000.2010.c.fips <- pop2000.2010 %>%
select(FIPS.Code, CTYNAME)
pop2000.2010.c.fips$County<-pop2000.2010.c.fips$CTYNAME
newdata2000v1 <- newdata2000v1 %>%
mutate(FIPS.Code = as.numeric(FIPS.Code))
newdata2000 <- left_join(newdata2000v1,
pop2000.2010.c.fips,
by = "FIPS.Code")
newdata2000<-newdata2000 %>% select(County, FIPS.Code, Year, Population)
#Selecting Pop data for County 1125 for 2000-2010
newdata2 <- pop2000.2010[ which(pop2000.2010$FIPS.Code==my_counties[i]), ]
newdata2010 <- cbind("2000"=newdata2$ESTIMATESBASE2000,
"2001"=newdata2$POPESTIMATE2001,
"2002"=newdata2$POPESTIMATE2002,
"2003"=newdata2$POPESTIMATE2003,
"2004"=newdata2$POPESTIMATE2004,
"2005"=newdata2$POPESTIMATE2005,
"2006"=newdata2$POPESTIMATE2006,
"2007"=newdata2$POPESTIMATE2007,
"2008"=newdata2$POPESTIMATE2008,
"2009"=newdata2$POPESTIMATE2009)
newdata2010<-as.data.frame(t(newdata2010))
newdata2010$County<-newdata2$CTYNAME
newdata2010$FIPS.Code<-newdata2$FIPS.Code
newdata2010$Year<-c(rownames(newdata2010))
names(newdata2010)[names(newdata2010) == 'V1'] <- 'Population'
newdata2010<-newdata2010 %>% select(County, FIPS.Code, Year, Population)
#Selecting Pop data for County 1125 for 2010-2019
newdata3 <- pop2010.2019[ which(pop2010.2019$FIPS.Code==my_counties[i]), ]
newdata2019 <- cbind(Year=newdata3$Year,
"2010"=newdata3$CENSUS2010POP,
"2011"=newdata3$POPESTIMATE2011,
"2012"=newdata3$POPESTIMATE2012,
"2013"=newdata3$POPESTIMATE2013,
"2014"=newdata3$POPESTIMATE2014,
"2015"=newdata3$POPESTIMATE2015,
"2016"=newdata3$POPESTIMATE2016,
"2017"=newdata3$POPESTIMATE2017,
"2018"=newdata3$POPESTIMATE2018,
"2019"=newdata3$POPESTIMATE2019)
newdata2019<-as.data.frame(t(newdata2019))
newdata2019$County<-newdata3$CTYNAME
newdata2019$FIPS.Code<-newdata3$FIPS.Code
newdata2019$Year<-c(rownames(newdata2019))
names(newdata2019)[names(newdata2019) == 'V1'] <- 'Population'
newdata2019<-newdata2019 %>% select(County, FIPS.Code, Year, Population)
N.rows <- length(my_counties)
combined <- vector("list", N.rows)
combined[i]<-list(rbind(newdata2000,newdata2010,newdata2019))
The problem is that you are re-specifying the creation of the combined object.
I am not sure what exactly your var1 is, but, possibly the following should work:
object<- c(1,2,3)
N.rows <- length(var1)
combined <- vector("list", N.rows)
for (i in 1:length(object)) {
df1<- SomeFunction1(object[i])
df2<- SomeFunction2(object[i])
df3<- SomeFunction3(object[i])
combined[i]<-list(rbind(df1,df2,df3))
}
Alternatively, using lapply:
object<- c(1,2,3)
combined<-lapply(object, function(i){
df1<- SomeFunction1(object[i])
df2<- SomeFunction2(object[i])
df3<- SomeFunction3(object[i])
list(rbind(df1,df2,df3))
}
But this will deliver the list of length 3 (with three lists with df1,df2 and df3), not the length defined by the length of var1...

Reading Excel file into R with operator in tab name

I am reading a file with different tabs into R. However, they changed the tab names so they contain operators now, which R doesnt seem to like. For instance (and this is where the code occurs) "Storico_G1" became "Storico_G+1".
I post the code below, but the error occurs early on. I am basically looking for a workaround/to change the tab names before i create data.frames.
NB I left the code as it was before they changed the tab name from "Storico_G1" to "Storico_G+1" as I think its easier to grasp this way.
Can anybody guide me in the right direction? Many thanks in advance!
library(ggplot2)
library(lubridate)
library(openxlsx)
library(reshape2)
library(dplyr)
library(scales)
Storico_G <- read.xlsx(xlsxFile = "http://www.snamretegas.it/repository/file/it/business-servizi/dati-operativi-business/dati_operativi_bilanciamento_sistema/2018/DatiOperativi_2018-IT.xlsx",sheet = "Storico_G", startRow = 1, colNames = TRUE)
Storico_G1 <- read.xlsx(xlsxFile = "http://www.snamretegas.it/repository/file/it/business-servizi/dati-operativi-business/dati_operativi_bilanciamento_sistema/2018/DatiOperativi_2018-IT.xlsx", startRow = 1, colNames = TRUE)
# Selecting Column C,E,R from Storico_G and stored in variable Storico_G_df
# Selecting Column A,P from Storico_G+1 and stored in variable Storico_G1_df
Storico_G_df <- data.frame(Storico_G$pubblicazione,Storico_G$IMMESSO, Storico_G$`RICONSEGNATO.(1)`, Storico_G$BILANCIAMENTO.RESIDUALE )
Storico_G1_df <- data.frame(Storico_G1$pubblicazione, Storico_G1$`SBILANCIAMENTO.ATTESO.DEL.SISTEMA.(SAS)`)
# Conerting pubblicazione in date format and time
Storico_G_df$pubblicazione <- ymd_h(Storico_G_df$Storico_G.pubblicazione)
Storico_G1_df$pubblicazione <- ymd_h(Storico_G1_df$Storico_G1.pubblicazione)
# Selecting on row which is having 4PM value in Storico_G+1 excel sheet tab
Storico_G1_df <- subset(Storico_G1_df, hour(Storico_G1_df$pubblicazione) == 16)
rownames(Storico_G1_df) <- 1:nrow(Storico_G1_df)
# Averaging hourly values to 1 daily data point in G excel sheet tab
Storico_G_df$Storico_G.pubblicazione <- strptime(Storico_G_df$Storico_G.pubblicazione, "%Y_%m_%d_%H")
storico_G_df_agg <- aggregate(Storico_G_df, by=list(day=format(Storico_G_df$Storico_G.pubblicazione, "%F")), FUN=mean, na.rm=TRUE)[,-2]
#cbind.fill function
cbind.fill <- function(...){
nm <- list(...)
nm <- lapply(nm, as.matrix)
n <- max(sapply(nm, nrow))
do.call(cbind, lapply(nm, function (x)
rbind(x, matrix(, n-nrow(x), ncol(x)))))
}
#cbind with both frames
G_G1_df= data.frame(cbind.fill(storico_G_df_agg,Storico_G1_df))
#keep required columns
keep=c("day", "Storico_G.IMMESSO","Storico_G..RICONSEGNATO..1..","Storico_G1..SBILANCIAMENTO.ATTESO.DEL.SISTEMA..SAS..")
#update dataframe to kept variables
G_G1_df=G_G1_df[,keep,drop=FALSE]
#Rename crazy variable names
G_G1_df <- data.frame(G_G1_df) %>%
select(day, Storico_G.IMMESSO, Storico_G..RICONSEGNATO..1.., Storico_G1..SBILANCIAMENTO.ATTESO.DEL.SISTEMA..SAS..)
names(G_G1_df) <- c("day", "Immesso","Riconsegnato", "SAS")
#Melt time series
G_G1_df=melt(G_G1_df,id.vars = "day")
#Create group variable
G_G1_df$group<- ifelse(G_G1_df$variable == "SAS", "SAS", "Immesso/Consegnato")
#plot
ggplot(G_G1_df, aes(as.Date(day),as.numeric(value),col=variable))+geom_point()+geom_line()+facet_wrap(~group,ncol=1,scales="free_y")+labs(x="Month", y="Values") +scale_x_date(labels=date_format("%m-%Y"))+geom_abline(intercept=c(-2,0,2),slope=0,data=subset(G_G1_df,group=="SAS"),lwd=0.5,lty=2)

Changing dataframe column names in R groups at a time

Suppose I have a data frame (DF) that looks like the following:
test <- c('Test1','Test2','Test3')
col.DF.names < c('ID', 'year', 'car', 'age', 'year.1', 'car.1', 'age.1', 'year.2', 'car.2', 'age.2')
ID <- c('A','B','C')
year <- c(2001,2002,2003)
car <- c('acura','benz','lexus')
age <- c(55,16,20)
year.1 <- c(2011,2012,2013)
car.1 <- c('honda','gm','bmw')
age.1 <- c(43,21,34)
year.2 <- c(1961,1962,1963)
car.2 <- c('toyota','porsche','jeep')
age.2 <- c(33,56,42)
DF <- data.frame(ID, year, car, age, year.1, car.1, age.1, year.2, car.2, age.2)
I need the columns of data frame to lose the ".#" and instead have the Test# in front of it, so it looks something like this:
ID Test1.year Test1.car Test1.age Test2.year Test2.car Test2.age Test3.year Test3.car Test3.age
.... with all the data
Does anyone have a suggestion? Basically, starting at the second column, I"d like to add the test[1] name for 3 columns, and then move to the next set of three columns and add test[2] and so on..
I know how to hard code it:
colnames(DF)[2:4] <- paste(test[1], colnames(DF)[2:4], sep = ".")
but this is a toy set, and I would like to somewhat automate it, so I'm not specifically indicating[2:4] for example.
You could try:
colnames(DF)[-1] <- paste(sapply(test, rep, 3), colnames(DF)[-1], sep = ".")
or perhaps the following would be better:
colnames(DF)[-1] <- paste(sapply(test, rep, 3), colnames(DF)[2:4], sep = ".")
or:
colnames(DF)[-1] <- paste(rep(test, each=3), colnames(DF)[2:4], sep = ".")
thanks to #thelatemail

Loop over a subset, source a file and save results in a dataframe

Similar questions have been asked already but none was able to solve my specific problem. I have a .R file ("Mycalculus.R") containing many basic calculus that I need to apply to subsets of a dataframe: one subset for each year where the modalities of "year" are factors (yearA, yearB, yearC) not numeric values. The file generates a new dataframe that I need to save in a Rda file. Here is what I expect the code to look like with a for loop (this one obviously do not work):
id <- identif(unlist(df$year))
for (i in 1:length(id)){
data <- subset(df, year == id[i])
source ("Mycalculus.R", echo=TRUE)
save(content_df1,file="myresults.Rda")
}
Here is an exact of the main data.frame df:
obs year income gender ageclass weight
1 yearA 1000 F 1 10
2 yearA 1200 M 2 25
3 yearB 1400 M 2 5
4 yearB 1350 M 1 11
Here is what the sourced file "Mycalculus.R" do: it applies numerous basic calculus to columns of the dataframe called "data", and creates two new dataframes df1 and then df2 based on df1. Here is an extract:
data <- data %>%
group_by(gender) %>%
mutate(Income_gender = weighted.mean(income, weight))
data <- data %>%
group_by(ageclass) %>%
mutate(Income_ageclass = weighted.mean(income, weight))
library(GiniWegNeg)
gini=c(Gini_RSV(data$Income_gender, weight), Gini_RSV(data$Income_ageclass,weight))
df1=data.frame(gini)
colnames(df1) <- c("Income_gender","Income_ageclass")
rownames(df1) <- c("content_df1")
df2=(1/5)*df1$Income_gender+df2$Income_ageclass
colnames(df2) <- c("myresult")
rownames(df2) <- c("content_df2")
So that in the end, I get two dataframes like this:
Income_Gender Income_Ageclass
content_df1 .... ....
And for df2:
myresult
content_df2 ....
But I need to save df1 and Rf2 as a Rda file where the row names of content_df1 and content_df2 are given per subset, something like this:
Income_Gender Income_Ageclass
content_df1_yearA .... ....
content_df1_yearB .... ....
content_df1_yearC .... ....
and
myresult
content_df2_yearA ....
content_df2_yearB ....
content_df2_yearC ....
Currently, my program does not use any loop and is doing the job but messily. Basically the code is more than 2500 lines of code. (please don't throw tomatoes at me).
Anyone could help me with this specific request?
Thank you in advance.
Consider incorporating all in one script with a defined function of needed arguments, called by lapply(). Lapply then returns a list of dataframes that you can rowbind into one final df.
library(dplyr)
library(GiniWegNeg)
runIncomeCalc <- function(data, y){
data <- data %>%
group_by(gender) %>%
mutate(Income_gender = weighted.mean(income, weight))
data <- data %>%
group_by(ageclass) %>%
mutate(Income_ageclass = weighted.mean(income, weight))
gini <- c(Gini_RSV(data$Income_gender, weight), Gini_RSV(data$Income_ageclass,weight))
df1 <- data.frame(gini)
colnames(df1) <- c("Income_gender","Income_ageclass")
rownames(df1) <- c(paste0("content_df1_", y))
return(df1)
}
runResultsCalc <- function(df, y){
df2 <- (1/5) * df$Income_gender + df$Income_ageclass
colnames(df2) <- c("myresult")
rownames(df2) <- c(paste0("content_df2_", y)
return(df2)
}
dfIncList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
runIncomeCalc(yeardata, i)
})
dfResList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
df <- runIncomeCalc(yeardata, i)
runResultsCalc(df, i)
})
df1 <- do.call(rbind, dfIncList)
df2 <- do.call(rbind, dfResList)
Now if you need to source across scripts. Create same two functions, runIncomeCalc and runResultsCalc in Mycalculus.R and then call each in other script:
library(dplyr)
library(GiniWegNeg)
if(!exists("runIncomeCalc", mode="function")) source("Mycalculus.R")
dfIncList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
runIncomeCalc(yeardata, i)
})
dfResList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
df <- runIncomeCalc(yeardata, i)
runResultsCalc(df, i)
})
df1 <- do.call(rbind, dfIncList)
df2 <- do.call(rbind, dfResList)
If you functional-ize your steps you can create a workflow like the following:
calcFunc <- function(df) {
## Do something to the df, then return it
df
}
processFunc <- function(fname) {
## Read in your table
x <- read.table(fname)
## Do the calculation
x <- calcFunc(x)
## Make a new file name (remember to change the file extension)
new_fname <- sub("something", "else", fname)
## Write the .RData file
save(x, file = new_fname)
}
### Your workflow
## Generate a vector of files
my_files <- list.files()
## Do the work
res <- lapply(my_files, processFunc)
Alternatively, don't save the files. Omit the save call in the processFunc, and return a list of data.frame objects. Then use either data.table::rbindlist(res) or do.call(rbind, list) to make one large data.frame object.

Resources