Mutate statement not working to enable graph - r

Working through a tidyverse time series example on the sunspot data, getting the following error message when I try to graph in 10 year increments, seems to not recognize the new date variable created in step 1, wondering if this is a format issue.
Error: Problem with mutate() column decade.
i decade = f(date).
x subscript out of bounds
Appreciate any help
# libraries
library(datasets)
library(tidyverse)
library(tsibble)
library(ggplot)
# Tidy the data
tidy_ts <- sunspots %>%
as_tsibble() %>% # Convert to timeseries tibble
mutate( # Create new variables
year = year(index), # Create a year column
month = month(index) # Create a month column
) %>%
select( # Select, reorder, rename vars
date = index, # Rename "index" to "date"
year, # Use "year" as second variable
month, # Use "month" as third variable
spots = value # Rename "value" as "spots"
) %>%
print() # Show data
view(tidy_ts)
# Graph the tidy data by decade
tidy_ts %>%
index_by( # By decade
decade = ~ floor_date(tidy_ts$date, years(10))
) %>%
summarise(mean_s = mean(spots)) %>% # Mean for decade
ggplot(aes(decade, mean_s)) + # Plot means
geom_point() + # Scatterplot
geom_smooth() + # Smoother
ylab("Sunspots: Mean by Decade") # Label

Try replacing:
index_by( # By decade
decade = ~ floor_date(tidy_ts$date, years(10))
)
with:
index_by(decade = floor(year/10)*10)

Related

R - Looping linear regression results for time series

I want to run linear regressions on the NZD vs a number of securities
I have some code to runs the regression but rather than apply it to each security i would prefer to run a loop through the list of securities to give me a file with the r^2 results from each linear regression
my dep variable is called: nzdusd
independent variables I would like to loop through are spx, adxy, vix
Code: as it currently stands with spx (like to use the same code to loop it through for variables adxy and vix as well)
library(tseries)
library(lmtest)
library(dplyr)
library(lubridate)
# 3 month regression, change variable here to get number of days
# e.g. 3 months sd = 60
# inputs
# 3 month regression
sd <- 60
# loading my market data from a saved location (variables nzdusd,spx, adxy, vix)
my_path <- file.path ("K:","X,"bbg_daily.Rdata")
load(file = my_path)
# Transform NZD into percentage change
pct.nzdusd <- nzdusd %>%
select(date, PX_LAST) %>%
mutate(lag = lag(PX_LAST),
pct_chg = (PX_LAST - lag) * 100 / lag) %>%
select(date, pct_chg)
# SPX(S&P 500)
myfun <- function(x) {
deparse(substitute(x))
}
# ^=^=^=^=^=^=^=^=^=^=^=^=^=^=
mysec_str <- myfun(spx)
mysec <- spx
z <- 5 # Series ID
# ^=^=^=^=^=^=^=^=^=^=^=^=^=^=
# Transform into percentage change
mypct <- mysec %>%
select(date, PX_LAST) %>%
mutate(lag = lag(PX_LAST),
pct_chg = (PX_LAST - lag) * 100 / lag) %>%
select(date, pct_chg)
assign(paste("pct.", mysec_str, sep = ""),mypct)
# join times series
ts <- paste("ts_", z, sep ="")
ts <- (inner_join(x = pct.nzdusd, y = mypct, by = "date"))
# get last row
last_row <- ts %>% slice(n())
end_dt <- last_row [1,1]
# start date declared above depending on regression
start_dt <- ts[((nrow (ts))-sd),1]
# getting subset of time series
ts_sub <- subset(ts,
date >= as.POSIXct(start_dt) &
date <= as.POSIXct(end_dt))
# regression
reg.ts = lm(pct_chg.x~pct_chg.y, ts_sub)
r2 <- summary(reg.ts)$r.squared
assign(paste(mysec_str, ".r2", sep = ""),r2)
stderr <- sqrt(deviance(reg.ts)/df.residual(reg.ts))
assign(paste(mysec_str, ".stderr", sep = ""),stderr)
#===================================================
r2 <- c(spx.r2, *adxy.r2, vix.r2*)
my_path2 <- file.path ("K:","x")
save (r2, file = my_path2 )
I've done code by simply copying and pasting and then replacing spx with the other variable names. But i know the code can be a lot slicker by using a loop. Particularily if I want to add a lot more independent variables
It's hard to known without reprex data, but to run multiple models, I've found pivoting longer, nesting by independent variables and then mutating through those variables works well. If your data just contains your dependent and independent variables, you can:
library(tidyverse)
ts_sub %>%
# Keep independent variable outside nested data
pivot_longer(- nzdusd, names_to = "dependent_vars", values_to = "values") %>%
nest_by(dependent_vars) %>%
mutate(model = list(lm(nzdusd ~ values, data = data)))
See: https://dplyr.tidyverse.org/reference/nest_by.html

Plotting dummy variables with ggplot2

I actually need help building on this question:
ggplot2 graphic order by grouped variable instead of in alphabetical order.
I need to produce a similar graph and I actually have a problem with the black points. I have data where column names are dates and rows are filled with 0 or 1 and I need to plot the point if the value is 1. To reproduce, here is a small sample (in my dataset, there is over 300 columns):
df <- data.frame(id=c(1,2,3),
"26April1970"=c(0,0,1),
"14August1970"=c(0,1,0))
I need to plot the dates on the x axis, match the id to the canton and show the points where the value is 1.
Could anyone help?
Try this:
plot_data = df %>%
## put data in long format
pivot_longer(-id, names_to = "colname") %>%
## keep only 1s
filter(value == 1) %>%
## convert dates to Date class
mutate(date = as.Date(colname, format = "%d%B%Y"))
plot_data
# # A tibble: 2 x 4
# id colname value date
# <dbl> <chr> <dbl> <date>
# 1 2 14August1970 1 1970-08-14
# 2 3 26April1970 1 1970-04-26
## plot
ggplot(plot_data, aes(x = date, y = factor(id))) +
geom_point()
Using this data:
df <- data.frame(id=c(1,2,3),
"26April1970"=c(0,0,1),
"14August1970"=c(0,1,0), check.names = FALSE)
Maybe you are looking for this:
library(ggplot2)
library(dplyr)
library(tidyr)
#Data
df <- data.frame(id=c(1,2,3),
"26April1970"=c(0,0,1),
"14August1970"=c(0,1,0))
#Code
df %>% pivot_longer(-id) %>%
ggplot(aes(x=name,y=factor(value)))+
geom_point(aes(color=factor(value)))+
scale_color_manual(values=c('transparent','black'))+
theme(legend.position = 'none')+xlab('Date')+ylab('value')
Output:

Getting two different means in R using same numbers

I'm trying to calculate the mean of some grouped data, but I'm running into an issue where the mean generated using base::mean() is generating a different value than when I use base:rowMeans() or try to replicate the mean in Excel.
Here's the code with a simplified data frame looking at just a small piece of the data:
df <- data.frame("ID" = 1101372,
"Q1" = 5.996667,
"Q2" = 6.005556,
"Q3" = 5.763333)
avg1 <- df %>%
summarise(new_avg = mean(Q1,
Q2,
Q3)) # Returns a value of 5.99667
avg2 <- rowMeans(df[,2:4]) # Returns a value of 5.921852
The value in avg2 is what I get when I use AVERAGE in Excel, but I can't figure out why mean() is not generating the same number.
Any thoughts?
Here, the mean is taking only the first argument i.e. Q1 as 'x' because the usage for ?mean is
mean(x, trim = 0, na.rm = FALSE, ...)
i.e. the second and third argument are different. In the OP's code, x will be taken as "Q1", trim as "Q2" and so on.. The ... at the end also means that the user can supply n number of parameters without any error and leads to confusions like this (if we don't check the usage)
We can specify the data as ., subset the columns of interest and use that in rowMeans
df %>%
summarise(new_avg = rowMeans(.[-1]))
This would be more efficient. But, if we want to use mean as such, then do a rowwise
df %>%
rowwise() %>%
summarise(new_avg = mean(c(Q1, Q2, Q3)))
# A tibble: 1 x 1
# new_avg
# <dbl>
#1 5.92
Or convert to 'long' format and then do the group_by 'ID' and get the mean
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = -ID) %>%
group_by(ID) %>% # can skip this step if there is only a single row
summarise(new_avg = mean(value))
# A tibble: 1 x 2
# ID new_avg
# <dbl> <dbl>
#1 1101372 5.92

apply function to grouped rows in dataframe [duplicate]

This question already has answers here:
Split dataframe using two columns of data and apply common transformation on list of resulting dataframes
(3 answers)
Closed 5 years ago.
I have created a function that computes a number of biological statistics, such as species range edges. Here is a simplified version of the function:
range_stats <- function(rangedf, lat, lon, weighting, na.rm=T){
cent_lat <- weighted.mean(x=rangedf[,lat], w=rangedf[,weighting], na.rm=T)
cent_lon <- weighted.mean(x=rangedf[,lon], w=rangedf[,weighting], na.rm=T)
out <- data.frame(cent_lat, cent_lon)
return(out)
}
I would like to apply this to a large dataframe where every row is an observation of a species. As such, I want the function to group rows by a specified set of columns, and then computer these statistics for each group. Here is a test dataframe:
LATITUDE <- c(27.91977, 21.29066, 26.06340, 28.38918, 25.97517, 27.96313)
LONGITUDE <- c(-175.8617, -157.8645, -173.9593, -178.3571, -173.9679, -175.7837)
BIOMASS <- c(4.3540488, 0.2406332, 0.2406332, 2.1419699, 0.3451426, 1.0946017)
SPECIES <- c('Abudefduf abdominalis','Abudefduf abdominalis','Abudefduf abdominalis','Chaetodon lunulatus','Chaetodon lunulatus','Chaetodon lunulatus')
YEAR <- c('2005', '2005', '2014', '2009', '2009', '2015')
testdf <- data.table(LATITUDE, LONGITUDE, BIOMASS, SPECIES, YEAR)
I want to apply this function to every unique combination of species and year to calculate summary statistics, i.e., the following:
testresult <- testdf %>%
group_by(SPECIES, YEAR) %>%
range_stats(lat="LATITUDE",lon="LONGITUDE",weighting="BIOMASS",na.rm=T)
However, the code above does not work (I get a (list) object cannot be coerced to type 'double' error) and I am not sure how else to approach the problem.
Since you add the tag of dplyr and purrr, I assume you are interested in a tidyverse solution. So below I will demonstrate a solution based on the tidyverse.
First, your range_stats is problematic. This is why you got the error message. The weighted.mean is expecting a vector for both the x and w argument. However, if rangedf is a tibble, the way you subset the tibble, such as rangedf[,lat] will still return a one-column tibble. A better way is to use pull from the dplyr package.
library(tidyverse)
range_stats <- function(rangedf, lat, lon, weighting, na.rm=T){
cent_lat <- weighted.mean(x = rangedf %>% pull(lat),
w = rangedf %>% pull(weighting), na.rm=T)
cent_lon <- weighted.mean(x = rangedf %>% pull(lon),
w = rangedf %>% pull(weighting), na.rm=T)
out <- data.frame(cent_lat, cent_lon)
return(out)
}
Next, the way you created the data frame is OK, but data.table is from the data.table package and you will create a data.table, not a tibble. I thought you want to use an approach from tidyverse, so I changed data.table to data_frame as follows.
LATITUDE <- c(27.91977, 21.29066, 26.06340, 28.38918, 25.97517, 27.96313)
LONGITUDE <- c(-175.8617, -157.8645, -173.9593, -178.3571, -173.9679, -175.7837)
BIOMASS <- c(4.3540488, 0.2406332, 0.2406332, 2.1419699, 0.3451426, 1.0946017)
SPECIES <- c('Abudefduf abdominalis','Abudefduf abdominalis','Abudefduf abdominalis','Chaetodon lunulatus','Chaetodon lunulatus','Chaetodon lunulatus')
YEAR <- c('2005', '2005', '2014', '2009', '2009', '2015')
testdf <- data_frame(LATITUDE, LONGITUDE, BIOMASS, SPECIES, YEAR)
Now, you said you want to apply the range_stats function to each combination of SPECIES and YEAR. One approach is to split the data frame to a list of data frames, and use lapply family function. But here I want to show you how to use the map family function to achieve this task as map is from the purrr package, which is part of the tidyverse.
We can first create a group indices based on SPECIES and YEAR.
testdf2 <- testdf %>%
mutate(Group = group_indices(., SPECIES, YEAR))
testdf2
# A tibble: 6 x 6
LATITUDE LONGITUDE BIOMASS SPECIES YEAR Group
<dbl> <dbl> <dbl> <chr> <chr> <int>
1 27.91977 -175.8617 4.3540488 Abudefduf abdominalis 2005 1
2 21.29066 -157.8645 0.2406332 Abudefduf abdominalis 2005 1
3 26.06340 -173.9593 0.2406332 Abudefduf abdominalis 2014 2
4 28.38918 -178.3571 2.1419699 Chaetodon lunulatus 2009 3
5 25.97517 -173.9679 0.3451426 Chaetodon lunulatus 2009 3
6 27.96313 -175.7837 1.0946017 Chaetodon lunulatus 2015 4
As you can see, Group is a new column showing the index number. Now we can split the data frame based on Group, and then use map_dfr to apply the range_stats function.
testresult <- testdf2 %>%
split(.$Group) %>%
map_dfr(range_stats, lat = "LATITUDE",lon = "LONGITUDE",
weighting = "BIOMASS", na.rm = TRUE, .id = "Group")
testresult
Group cent_lat cent_lon
1 1 27.57259 -174.9191
2 2 26.06340 -173.9593
3 3 28.05418 -177.7480
4 4 27.96313 -175.7837
Notice that map_dfr can automatic bind the output list of data frames to a single data frame. .id = "Group" means we want to create a column called Group based on the name of the list element.
I separated the process into two steps, but of course they can be all in one pipeline as follows.
testresult <- testdf %>%
mutate(Group = group_indices(., SPECIES, YEAR)) %>%
split(.$Group) %>%
map_dfr(range_stats, lat = "LATITUDE",lon = "LONGITUDE",
weighting = "BIOMASS", na.rm = TRUE, .id = "Group")
If you want, testresult can be merged with testdf using left_join, but I will stop here as testresult is probably already the desired output you want. I hope this helps.
Fundamentally, the main issue involves weighted.mean() where you are passing a dataframe object and not a vector that can be coerced to double. To fix within method, simply change:
x=rangedf[,lat]
To double brackets:
x=rangedf[[lat]]
Adjusted method:
range_stats <- function(rangedf, lat, lon, weighting, na.rm=T){
cent_lat <- weighted.mean(x=rangedf[[lat]], w=rangedf[[weighting]], na.rm=T)
cent_lon <- weighted.mean(x=rangedf[[lon]], w=rangedf[[weighting]], na.rm=T)
out <- data.frame(cent_lat, cent_lon)
return(out)
}
As for overall group by slice computation, do forgive me in bypassing, dplyr and data.table which you use and consider base R's underutilized but useful method, by().
The challenge with your current setup is the output of range_stats method return is a data.frame of two columns and dplyr's group_by() expects one aggregation vector operation. However, by passes dataframe objects (sliced by factors) into a defined function to return a list of data.frames which you can then rbind for one final dataframe:
df_List <- by(testdf, testdf[, c("SPECIES", "YEAR")], FUN=function(df)
data.frame(species=df$SPECIES[1],
year=df$YEAR[1],
range_stats(df,"LATITUDE","LONGITUDE","BIOMASS"))
)
finaldf <- do.call(rbind, df_List)
finaldf
# species year cent_lat cent_lon
# 1 Abudefduf abdominalis 2005 27.57259 -174.9191
# 2 Chaetodon lunulatus 2009 28.05418 -177.7480
# 3 Abudefduf abdominalis 2014 26.06340 -173.9593
# 4 Chaetodon lunulatus 2015 27.96313 -175.7837

function applied to summarise + group_by doesn't work correctly

I extract my data
fluo <- read.csv("data/ctd_SOMLIT.csv", sep=";", stringsAsFactors=FALSE)
I display in three columns : the day, the month and the year based on the original date : Y - m - d
fluo$day <- day(as.POSIXlt(fluo$DATE, format = "%Y-%m-%d"))
fluo$month <- month(as.POSIXlt(fluo$DATE, format = "%Y-%m-%d"))
fluo$year <- year(as.POSIXlt(fluo$DATE, format = "%Y-%m-%d"))
This is a part of my data_frame:
Then, I do summarise and group_by in order to apply the function :
prof_DCM = fluo[max(fluo$FLUORESCENCE..Fluorescence.),2]
=> I want the depth of the max of FLUORESCENCE measured for each month, for each year.
mean_fluo <- summarise(group_by(fluo, month, year),
prof_DCM = fluo[max(fluo$FLUORESCENCE..Fluorescence.),2])
mean_fluo <- arrange(mean_fluo, year, month)
View(mean_fluo)
But it's not working ...
The values of prof_DCM still the same all along the column 3 of the data_frame:
Maybe try the following code.
library(dplyr)
mean_fluo <- fluo %>%
group_by(month,year) %>%
filter(FLUORESCENCE..Fluorescence. == max(FLUORESCENCE..Fluorescence.)) %>%
arrange(year,month)
View(mean_fluo)
You can select the variables you want to keep with 'select'
mean_fluo <- fluo %>%
group_by(month,year) %>%
filter(FLUORESCENCE..Fluorescence. == max(FLUORESCENCE..Fluorescence.)) %>%
arrange(year,month)%>%
select(c(month,year,PROFONDEUR))

Resources