I have a list of 701 given csv files. Each one has the same number of columns (7) but different number of rows (between 25000 and 28000).
Here is an extract of the first file:
Date,Week,Week Day,Hour,Price,Volume,Sale/Purchase
18/03/2011,11,5,1,-3000.00,17416,Sell
18/03/2011,11,5,1,-1001.10,17427,Sell
18/03/2011,11,5,1,-1000.00,18055,Sell
18/03/2011,11,5,1,-500.10,18057,Sell
18/03/2011,11,5,1,-500.00,18064,Sell
18/03/2011,11,5,1,-400.10,18066,Sell
18/03/2011,11,5,1,-400.00,18066,Sell
18/03/2011,11,5,1,-300.10,18068,Sell
18/03/2011,11,5,1,-300.00,18118,Sell
Now I am trying to plot the coefficients of my following regression (in the price intervall -50 and 150) of the supply curve for the ninth hour over one year.
First I made the regression:
allenamen <- dir(pattern="*.csv")
alledat <- lapply(allenamen, read.csv, header = TRUE, sep = ",", stringsAsFactors = FALSE)
h <- list()
for(i in 1:length(alledat)){
g <- function(a, b, c, d, p) {a*atan(b*p+c)+d}
f <- nlsLM(Volume ~ g(a,b,c,d,Price), data=subset(alledat[[i]], (Hour==9) & (Sale.Purchase == "Sell") & (!Price %in% as.character(-50:150))), start = list(a=4000, b=0.1, c=-5, d=32000))
h[[i]] <- coef(f)
}
h.df <- setNames(do.call(rbind.data.frame, h), names(h[[1]]))
Then I just took the datas of the supply curve and the ninth hour and changed the format of the date:
files <- list.files(pattern="*.csv")
df <- data.frame()
for(i in 1:length(files)) {
xx <- read.csv(as.character(files[i]))
xx <- subset(xx, Sale.Purchase == "Sell" & Hour == 9)
df <- rbind(df, xx)
}
df$Date <- as.Date(as.character(df$Date), format="%d/%m/%Y")
And then I tried to plot the coefficient a:
plot(h.df$a ~ Date, df, xlim = as.Date(c("2012-01-01", "2012-12-31")))
But I get this error:
Error in (function (formula, data = NULL, subset = NULL, na.action = na.fail, :
variable lengths differ (found for 'Date')
Thank you for your help!
Related
I am having 2 sets of raster data and their names are:
ntl_'a number'.tif
pop_'a number'.tif
My goal is to create a function that reads the first pair of rasters (e.g., ntl_1.tif and pop_1.tif), then executes the below code and then repeats the process with the next pair:
library(raster)
library(DescTools)
#create a data.frame of values from the NTL and pop raster data
ntl = raster("path/ntl_1.tif")
vals_ntl <- as.data.frame(values(ntl))
ntl_coords = as.data.frame(xyFromCell(ntl, 1:ncell(ntl)))
combine <- as.data.frame(cbind(ntl_coords,vals_ntl))
pop<-raster("path/pop_1.tif")
pop = resample(pop, ntl, method = 'bilinear')
vals_pop <- as.data.frame(values(pop))
block.data <- as.data.frame(cbind(combine, vals_pop))
names(block.data)[3] <- "ntl"
names(block.data)[4] <- "pop"
block.data <- na.omit(block.data)
block.data = subset(block.data, select = -c(x, y))
# sort by ntl
block.data <-block.data[order(block.data$ntl),]
ntl_vector <- block.data[ , "ntl"]
pop_vector <- block.data[ , "pop"]
#compute gini index
Gini(ntl_vector, pop_vector, unbiased = FALSE)
My issue is with the code inside the function, I do not know how to properly make the syntax (the above code is for a pair of raster while I have hundreds of pairs). Hopefully I can get the results (i.e., the gini coefficient) of every pair in my console or, even better, in a data.frame. The data are here.
library(purrr)
library(fs)
raster_gini <- function(
.ntl = "ntl_1.tif",
.pop = "pop_1.tif",
.rdgal = TRUE
) {
if(.rdgal) {
ntl = raster(.ntl)
vals_ntl <- as.data.frame(values(ntl))
ntl_coords = as.data.frame(xyFromCell(ntl, 1:ncell(ntl)))
combine <- as.data.frame(cbind(ntl_coords,vals_ntl))
pop<-raster(.pop)
pop = resample(pop, ntl, method = 'bilinear')
vals_pop <- as.data.frame(values(pop))
block.data <- as.data.frame(cbind(combine, vals_pop))
#rename the columns
names(block.data)[3] <- "ntl"
names(block.data)[4] <- "pop"
#remove NA values
block.data <- na.omit(block.data)
#remove the columns x & y
block.data = subset(block.data, select = -c(x, y))
# sort by ntl
block.data <-block.data[order(block.data$ntl),]
ntl_vector <- block.data[ , "ntl"]
pop_vector <- block.data[ , "pop"]
#compute gini index
gini <- Gini(ntl_vector, pop_vector, unbiased = FALSE)
c(ntl = .ntl, pop = .pop, gini = gini)
} else {
c(ntl = .ntl, pop = .pop)
}
}
doc_paths_ntl <- fs::dir_ls("path_to_ntl_raster", glob = "*tif*")
doc_paths_pop <- fs::dir_ls("path_to_pop_raster", glob = "*tif*")
result_df <- purrr::map2_dfr(.x = doc_paths_ntl, .y = doc_paths_pop, .f = raster_gini)
result_df <- result_df |>
dplyr::mutate(ntl = basename(ntl)) |>
dplyr::mutate(pop = basename(pop))
result_df
How can you iterate in a for loop with specific column names in R? This is the dataset I am using and below are the names of the columns I want to iterate. Also are the column number.
When I try to iterate, it does not compile. I need this to create a multiple cluster data visualization.
if (!require('Stat2Data')) install.packages('Stat2Data')
library(Stat2Data)
data("Hawks")
#summary(Hawks)
for (i in 10:13(Hawks)){
print(Hawks$ColumnName)
}
for (i in Hawks(c("Wing","Weight","Culmen","Hallux"))){
print(Hawks$ColumnName)
}
EDIT
After what Martin told me, this error occurs:
Error in [.data.frame`(Hawks, , i) : undefined columns selected
This is the code I have:
if(!require('DescTools')) {
install.packages('DescTools')
library('DescTools')
}
Hawks$Wing[is.na(Hawks$Wing)] <- mean(Hawks$Wing, na.rm = TRUE)
Hawks$Weight[is.na(Hawks$Weight)] <- mean(Hawks$Weight, na.rm = TRUE)
Hawks$Culmen[is.na(Hawks$Culmen)] <- mean(Hawks$Culmen, na.rm = TRUE)
Hawks$Hallux[is.na(Hawks$Hallux)] <- mean(Hawks$Hallux, na.rm = TRUE)
# ParĂ¡metro Wing
n <- nrow(Hawks) # Number of rows
for (col_names in 10:13){
x <- matrix(Hawks[, i],0.95*n)
#x <- rbind(x1,x2)
plot (x)
fit2 <- kmeans(x, 2)
y_cluster2 <- fit2$cluster
fit3 <- kmeans(x, 3)
y_cluster3 <- fit3$cluster
fit4 <- kmeans(x, 4)
y_cluster4 <- fit4$cluster
}
I have a data frame as "df" and 41 variables var1 to var41. If I write this command
pcdtest(plm(var1~ 1 , data = df, model = "pooling"))[[1]]
I can see the test value. But I need to apply this test 41 times. I want to access variable by column number which is "df[1]" for "var1" and "df[41]" for "var41"
pcdtest(plm(df[1]~ 1 , data = dfp, model = "pooling"))[[1]]
But it fails. Could you please help me to do this? I will have result in for loop. And I will calculate the descriptive statistics for all the results. But it is very difficult to do test for each variable.
I think you can easily adapt the following code to your data. Since you didn't provide any of your data, I used data that comes with the plm package.
library(plm) # for pcdtest
# example data from plm package
data("Cigar" , package = "plm")
Cigar[ , "fact1"] <- c(0,1)
Cigar[ , "fact2"] <- c(1,0)
Cigar.p <- pdata.frame(Cigar)
# example for one column
p_model <- plm(formula = pop~1, data = Cigar.p, model = "pooling")
pcdtest(p_model)[[1]]
# run through multiple models
l_plm_models <- list() # store plm models in this list
l_tests <- list() # store testresults in this list
for(i in 3:ncol(Cigar.p)){ # start in the third column, since the first two are state and year
fmla <- as.formula(paste(names(Cigar.p)[i], '~ 1', sep = ""))
l_plm_models[[i]] <- plm(formula = as.formula(paste0(colnames(Cigar.p)[i], "~ 1", sep = "")),
data = Cigar.p,
model = "pooling")
l_tests[[i]] <- pcdtest(l_plm_models[[i]])[[1]]
}
testresult <- data.frame("z" = unlist(l_tests), row.names = (colnames(Cigar.p[3:11])))
> testresult
z
price 175.36476
pop 130.45774
pop16 155.29092
cpi 176.21010
ndi 175.51938
sales 99.02973
pimin 175.74600
fact1 176.21010
fact2 176.21010
# example for cipstest
matrix_results <- matrix(NA, nrow = 11, ncol = 2) # use 41 here for your df
l_ctest <- list()
for(i in 3:ncol(Cigar.p)){
l_ctest[[i]] <- cipstest(Cigar.p[, i], lags = 4, type = 'none', model = 'cmg', truncated = F)
matrix_results[i, 1] <- as.numeric(l_ctest[[i]][1])
matrix_results[i, 2] <- as.numeric(l_ctest[[i]][7])
}
res <- data.frame(matrix_results)
names(res) <- c('cips-statistic', 'p-value')
print(res)
Try using as.formula(), for example:
results <- list()
for (i in 1:41){
varName <- paste0('var',i)
frml <- paste0(varName, ' ~ 1')
results[[i]] <-
pcdtest(plm(as.formula(frml) , data = dfp, model = "pooling"))[[1]]
}
You can use reformulate to create the formula and apply the code for 41 times using lapply :
var <- paste0('var', 1:41)
result <- lapply(var, function(x) pcdtest(plm(reformulate('1', x),
data = df, model = "pooling"))[[1]])
I have daily rainfall data which I have converted to yearwise cumulative value using following code
library(tidyverse); library(segmented); library(seas); library(SiZer)
## get mscdata from "seas" packages
data(mscdata)
dat <- (mksub(mscdata, id=1108447))
dat$julian.date <- as.numeric(format(dat$date, "%j"))
## generate cumulative sum of rain by year
df <- dat %>% group_by(year) %>% mutate(rain_cs = cumsum(rain)) %>% ungroup
Then I want to divide every year into 2 parts (before 210 days and after 210 days) then apply piecewise linear model from SiZer to identify yearwise breakpoints. I could able to do it for single year like
data <- subset(df, year == 1975)
sub1 <- filter(data, julian.date < 210)
sub2 <- filter(data, julian.date > 210)
sub1.mod <- piecewise.linear(x= sub1$julian.date, y = sub1$rain_cs,
middle = 1,
CI = T,
bootstrap.samples = 1000)
sub1.mod
sub2.mod <- piecewise.linear(x= sub2$julian.date, y = sub2$rain_cs,
CI = T,
bootstrap.samples = 1000)
sub2.mod
Now how to dynamically fit piecewise linear model for all the years?
You can try using a function and base R, creating a list and then saving the models. I include in last line a way to export all models outside the list:
library(tidyverse); library(segmented); library(seas); library(SiZer)
## get mscdata from "seas" packages
data(mscdata)
dat <- (mksub(mscdata, id=1108447))
dat$julian.date <- as.numeric(format(dat$date, "%j"))
## generate cumulative sum of rain by year
df <- dat %>% group_by(year) %>% mutate(rain_cs = cumsum(rain)) %>% ungroup
#Create list
Listyear <- split(df,df$year)
#Function for year process
model_function<-function(x)
{
data <- x
sub1 <- filter(data, julian.date < 210)
sub2 <- filter(data, julian.date > 210)
sub1.mod <- piecewise.linear(x= sub1$julian.date, y = sub1$rain_cs,
middle = 1,
CI = T,
bootstrap.samples = 1000)
sub1.mod
sub2.mod <- piecewise.linear(x= sub2$julian.date, y = sub2$rain_cs,
CI = T,
bootstrap.samples = 1000)
sub2.mod
#Group elements
list.model <- list(v1=sub1.mod,v2=sub2.mod)
names(list.model)<-paste0(c("sub.mod1.","sub.mod2."),unique(x$year))
return(list.model)
}
#Iterate over all models
z1 <- lapply(Listyear,model_function)
#Export elements to envir
lapply(z1,list2env,.GlobalEnv)
You will end up with z1:
$`1975`
$`1975`$sub.mod1.1975
[1] "Threshold alpha: 85.0000277968913"
[1] ""
[1] "Model coefficients: Beta[0], Beta[1], Beta[2]"
(Intercept) x w
26.730070 3.376754 -2.406744
Change.Point Initial.Slope Slope.Change Second.Slope
2.5% 82.87297 3.259395 -2.515015 0.9283611
97.5% 87.90540 3.478656 -2.273062 1.0153773
$`1975`$sub.mod2.1975
[1] "Threshold alpha: 274.000071675723"
[1] ""
[1] "Model coefficients: Beta[0], Beta[1], Beta[2]"
(Intercept) x w
-37.968273 2.150220 5.115431
Change.Point Initial.Slope Slope.Change Second.Slope
2.5% 272.0000 1.969573 4.750341 7.057207
97.5% 276.0001 2.371539 5.468130 7.504963
And by running last line you will get the models in the global environment:
I hope this can help.
Code for exporting to csv.
I include an additional function that takes some results from the models and creates dataframes so that it can be easily exported to .csv after doing some adjusts to lists. The function is next:
model_export<-function(x)
{
data <- x
sub1 <- filter(data, julian.date < 210)
sub2 <- filter(data, julian.date > 210)
sub1.mod <- piecewise.linear(x= sub1$julian.date, y = sub1$rain_cs,
middle = 1,
CI = T,
bootstrap.samples = 1000)
sub1.mod
sub2.mod <- piecewise.linear(x= sub2$julian.date, y = sub2$rain_cs,
CI = T,
bootstrap.samples = 1000)
sub2.mod
#Group elements for models
#Model 1
modelname <- rep('sub1.mod',2)
year <- rep(unique(x$year),2)
changepoint <- rep(sub1.mod$change.point,2)
coefs <- as.data.frame(t(sub1.mod$model$coefficients))
intervals <- as.data.frame(sub1.mod$intervals)
intervals <- cbind(data.frame(confidence=rownames(intervals)),intervals)
rownames(intervals)<-NULL
#Build DF
DF1 <- data.frame(modelname,year,changepoint,coefs,intervals)
#Model 2
modelname <- rep('sub2.mod',2)
changepoint <- rep(sub2.mod$change.point,2)
coefs <- as.data.frame(t(sub2.mod$model$coefficients))
intervals <- as.data.frame(sub2.mod$intervals)
intervals <- cbind(data.frame(confidence=rownames(intervals)),intervals)
rownames(intervals)<-NULL
#Build DF
DF2 <- data.frame(modelname,year,changepoint,coefs,intervals)
#Bind DFs
DFG <- rbind(DF1,DF2)
return(DFG)
}
Then you can apply:
#Apply new function to list
z2 <- lapply(Listyear,model_export)
#DF to export
MyDF <- do.call(rbind,z2)
#Export
write.csv(MyDF,file='Myfile.csv')
I have used it for two years having the results saved in MyDF and then exported to .csv file. Just as consideration if rbind would not work for any reason you could try rbind.fill() from plyr package.
I am trying to sum values that are greater than 70 in several different data sets. I believe that aggregate can do this but my research has not pointed to an obvious solution to obtaining the values that exceed seventy in my data sets. I have first used aggregate to get the daily max values and put these values into the data frame called yearmaxs. Here is my code and what I have tried:
number of times O3 >70 in a year per site
Sys.setenv(TZ = "UTC")
library(openair)
library(lubridate)
filedir <- "C:/Users/dfmcg/Documents/Thesisfiles/8hravg"
myfiles <- c(list.files(path = filedir))
paste(filedir, myfiles, sep = '/')
npsfiles <- c(paste(filedir, myfiles,sep = '/'))
for (i in npsfiles[22]) {
x <- substr(i,45,61)
y <- paste('C:/Users/dfmcg/Documents/Thesisfiles/exceedenceall', x, sep='/')
timeozone <- import(i, date="DATES", date.format = "%Y-%m-%d %H", header=TRUE, na.strings="NA")
overseventy <- c()
yearmaxs <- aggregate(rolling.O3new ~ format(as.Date(date)), timeozone, max)
colnames(yearmaxs) <- c("date", "daymax")
overseventy <- aggregate(daymax ~ format(as.Date(date)), yearmaxs, FUN = length,
subset = as.numeric(daymax) > 70)
colnames(overseventy) <- c("date", "daymax")
aggregate(daymax ~ format(as.Date(date), "%Y"), overseventy, sum)
I have also tried: sum > "70 and sum(daymax > "70).
My other idea at this point is using a for loop to iterate through the values. I was hoping that a could use aggregate again to sum the values of interest. Any help at all would be greatly appreciated!
I think you want:
aggregate(daymax ~ format(as.Date(date)), yearmaxs, FUN = length,
subset = as.numeric(daymax) > 70)
To things:
you need numerical comparison, so use as.numeric(daymax) > 70 not daymax > "70";
use the subset argument in aggregate.formula.