sort data into deciles based on a rolling subset - r

I am trying to replicate the Fama French 1993 paper using R. I need to do the following sorting :
for each month,
calculate ME decile breakpoints on NYSE stocks only
sort all stocks into the deciles created in 2.
Data generation:
set.seed(1234)
n = 120
stocks <- c("A", "B", "C", "D", "E")
exchange <- c("NYSE", "NASDAQ", "AMEX")
df <- as.data.frame(cbind(Month = 1:12,
exchangeCode = exchange[round(runif(n, 1, 3))],
Stock = stocks[round(runif(n, 1, 5))],
ME=floor(100*abs(rnorm(n)))))
Desired Output:
ME_NYSE_vals <- as.numeric(paste(df[df$Month==1 & df$exchangeCode=="NYSE","ME"]))
ME_ALL_vals <- as.numeric(paste(df[df$Month==1,"ME"]))
cut(x = ME_ALL_vals,
breaks = c(-Inf,quantile(ME_NYSE_vals,probs=seq(.1,.9,.1)),+Inf),
labels = 1:10
)
The breaks should be calculated based on ME_NSYE_vals. The cut should be applied to all ME_ALL_vals for each month.

If the intention is to keep the whole data frame but generate deciles only for the NYSE values the code below could do. The point was to generate deciles only for the entries pertaining to the NYSE values but to keep the full data set achieving some form of a partial sorting.
# Libs
Vectorize(require)(package = c("dplyr", "magrittr"),
character.only = TRUE)
# Transformations
df %<>%
mutate(nTileNYSE = ifelse(exchangeCode == "NYSE", ntile(ME, 10), NA))
arrange(nTileNYSE)
The code was applied to the data:
set.seed(1)
df <- as.data.frame(cbind(exchangeCode = c("NYSE", "NASDAQ"),
Stock = c("A", "B", "C", "A"),
Month = 1:12,
ME=rnorm(1200)))
2nd approach
Following the discussion in the comments I would suggest the following approach:
# Libs --------------------------------------------------------------------
Vectorize(require)(package = c( "tidyr", "dplyr", "magrittr", "xts", "Hmisc"),
char = TRUE)
# Data generation ---------------------------------------------------------
set.seed(1234)
n = 120
stocks <- c("A", "B", "C", "D", "E")
exchange <- c("NYSE", "NASDAQ", "AMEX")
df <- as.data.frame(cbind(Month = 1:12,
exchangeCode = exchange[round(runif(n, 1, 3))],
Stock = stocks[round(runif(n, 1, 5))],
ME = floor(100*abs(rnorm(n)))))
# Transformations ---------------------------------------------------------
# For some reason this was needed
df$ME <- as.numeric(as.character(df$ME))
# Generate cuts
dfNtiles <- df %>%
arrange(exchangeCode, Month, ME) %>%
group_by(exchangeCode, Month) %>%
mutate(cutsBsdOnNYSE = cut(x = ME,
breaks = cut2(x = df$ME[df$exchangeCode == "NYSE"],
g = 10, onlycuts = TRUE))) %>%
ungroup() %>%
group_by(cutsBsdOnNYSE) %>%
mutate(grpBsdOnNYSE = n())
It's fairly straightforward
Generating cut brackets reflecting subset of the data.
Applying those brackets to the whole vector (ME)
Numbering the obtained groups so a group identifier is created
and boils down to:

Related

Is there any way to plot labels in ggplot using two different data sets with different number of columns and rows?

I'm trying to plot and metaMDS with the labels (spp) of the distance data.
dat<-data.frame(
"site" = c("a", "a", "a",'a', "b", "b", "b", 'b', "c", "c", "c",'c'),
"sample"=c(rep(1,4),rep(2,4),rep(3,4)),
"sp1"=c(1,3,3,2,4,2,1,5,3,6,1,5),
"sp2"=c(1,3,3,2,4,2,1,5,3,6,1,5),
"sp3"=c(1,3,3,2,4,2,1,5,3,6,1,5),
"sp4"=c(2,3,4,1,5,3,1,5,5,8,9,1),
"sp5"=c(3,4,3,1,6,7,5,8,3,1,3,2)
)
I tried
```
dat1<-dat[,3:7]
dmds<-metaMDS(dat1, distance = "bray", autotransform = FALSE)
mds1 <- dmds$points[,1]
mds2 <- dmds$points[,2]
plt<-cbind(dat,mds1,mds2)
spp<-names(dat1)
ggplot(plt,aes(mds1,mds2, shape=site,color=site))+
geom_point()+
geom_text(aes(label=spp), size=3)
```
However, a get an error
Error: Aesthetics must be either length 1 or the same as the data (12): label
Because the number of rows of dat and rows of "spp<-names(dat1)" are different
Is there any way to workaround this error?
I would like to have something like this, but with the correct spp number
As a further potential direction - I think you mean that you'll have a separate set of species for each site, rather than the five (sp1:sp5) identical species labelled in each site? Here's a worked example of what that might look like, passing three dataframes to allow for different columns representing three different sets of species:
library(tidyverse)
library(vegan)
dat_a<-data.frame(
"site" = "a",
sample = c(1:4),
"sp1_a"=c(1,3,3,2),
"sp2_a"=c(1,3,3,2),
"sp3_a"=c(1,3,3,2),
"sp4_a"=c(2,3,4,1),
"sp5_a"=c(3,4,3,1)
)
dat_b<-data.frame(
"site" = "b",
sample = c(1:4),
"sp1_b"=c(4,2,1,5),
"sp2_b"=c(4,2,1,5),
"sp3_b"=c(4,2,1,5),
"sp4_b"=c(5,3,1,5),
"sp5_b"=c(6,7,5,8)
)
dat_c<-data.frame(
"site" = "c",
sample = c(1:4),
"sp1_c"=c(3,6,1,5),
"sp2_c"=c(3,6,1,5),
"sp3_c"=c(3,6,1,5),
"sp4_c"=c(5,8,9,1),
"sp5_c"=c(3,1,3,2)
)
# Using purrr::map to iterate over lists of dataframes
results <-
map(list(dat_a, dat_b, dat_c), function(x)
metaMDS(x[, 3:7], , distance = "bray", autotransform = FALSE))
results %>%
map2(c("a", "b", "c"), ~ tibble(
site = .y,
species = rownames(.x$species),
x = .x$species[,1],
y = .x$species[,2]
)) %>%
bind_rows() %>%
ggplot(aes(x, y, label = species, col = site)) +
geom_text()
Created on 2021-05-08 by the reprex package (v2.0.0)

Plotting Number of Times Value Appears in Two Dataframes in R

I have two sets of data. Each contains a column for the name of the molecule and a column for the number of times that molecule appears in the sample. I want to create a scatterplot with the number of times a molecule appears in dataset #1 on the x-axis and how many times it appears in dataset #2. If a molecule is in one dataset and not the other, it appears 0 times.
Example:
dat1 <- data.frame(
name = c("A", "B", "D", "E")
count = c(10, 1, 30, 10)
)
dat2 <- data.frame(
name = c("A", "B", "C", "F")
count = c(1, 3, 50, 40)
)
Point #1 would be (10,1) corresponding to A, Point #2 would be (1,3), Point #3 would be (0,50) and so on. I don't want to label my points since my datasets contain tens of thousands of molecules.
Try joining the data.frames
full_join(dat1, dat2, by="name") %>%
mutate_all(function(xx) ifelse(is.na(xx), 0, xx)) %>%
ggplot(aes(count.x, count.y)) +
geom_point()
which produces
You would need a full_join():
library(dplyr)
library(ggplot2)
#Data
dat1 <- data.frame(
name = c("A", "B", "D", "E"),
count = c(10, 1, 30, 10)
)
dat2 <- data.frame(
name = c("A", "B", "C", "F"),
count = c(1, 3, 50, 40)
)
#Code
dat1 %>% full_join(dat2 %>% rename(count2=count)) %>%
replace(is.na(.),0) %>%
ggplot(aes(x=count,y=count2))+
geom_point()+
geom_text(aes(label=name),vjust=-0.5)
Output:

Calculating a rolling return

I have a data frame with 3 columns. What I want to do is to calculate the product of the return over a selected month rolling period for each monthly period (or said another way, each row) (where available). This is the basic structure of the data.
set.seed = 100
assets <- c("A", "B", "C", "D", "E", "F", "G", "H", "I")
FileDate <- seq(as.Date("2011-12-30"), as.Date("2019-01-31"), by="months")
df <- merge(x = assets, y = FileDate, all.x = TRUE)
df$return <- runif(774, min=0, max=1)
What it should end with is a dataframe where a new column is added with the selected period cumulative return for that time frame. For example, I have shown below a four month return. The calculation of the 4-month return on 03/30/2012 from the data would be:
((1+0.81/100)(1+0.715/100)(1+0.27/100)*(1+0.80/100)-1)*100
This would be repeated for each value under the X column.
I ended up utilizing the mutate function there you can set the lag width. in the end version I wanted
library(dplyr)
library(zoo)
# Create Test Dataframe
set.seed = 100
assets <- c("A", "B", "C", "D", "E", "F", "G", "H", "I")
FileDate <- seq(as.Date("2011-12-30"), as.Date("2019-01-31"), by="months")
df <- merge(x = assets, y = FileDate, all.x = TRUE)
df$performance <- runif(774, min=0, max=1)
This particular code creates a 5 month average on a rolling basis. If you sort by column X you can see and recreate it in excel.
df <- df %>%
group_by(x) %>%
mutate(x_mean = rollmean(performance, 5, fill = NA, align = 'right'))
I also found a way to create a lag so I could take the 4 prior values to the observation and calculate the mean:
df2 = df %>%
mutate(perf.4.previous = rollapply(data = perf.1.previous, width = 4, FUN =
mean, align = "right", fill = NA, na.rm = T))

Creating duplicate in R

I have the following input data frame with 4 columns and 3 rows.
The time column can take value from 1 to the corresponding value of the maturity column for that customer, I want to create more observations for each customer till the value of time is = value of maturity, with the other columns retaining their original value. Please see the below links for input and expected output
Input
Output
Here is a dplyr solution inspired but not exactly equal to this post.
library(dplyr)
df <- data.frame(custno = 1:3, time = 1, dept = c("A", "B", "A"))
df %>%
slice(rep(1:n(), each = 5)) %>%
group_by(custno) %>%
mutate(time = seq_along(time))
Edit
After the comments by the OP, the following seems to be better.
First, the data:
df <- data.frame(custno = 1:3, time = 1,
dept = c("A", "B", "A"),
maturity = c(5,4,6))
And the solution.
df %>%
tidyr::uncount(maturity) %>%
group_by(custno) %>%
mutate(time = seq_along(time))
We can also use slice with row_number
library(dplyr)
library(data.table)
df %>%
slice(rep(row_number(), maturity)) %>%
mutate(time = rowid(custno))
data
df <- data.frame(custno = 1:3, time = 1,
dept = c("A", "B", "A"),
maturity = c(5,4,6))

Why does eval(parse(text=string)) give shorter output for object of class aov?

Why does this...
B.aov2<- eval(parse(text=StringforEvaluation))
summary(B.aov2)
produce a less detailed report than this...?
res.aov2 <- aov(DV ~ IV1*IV2+Error(Participant/(IV1*IV2), data = AnovaAnalysisData))
summary(res.aov2)
Reproducible data, per request of a user. I know it's weird, but I'm making a user-friendly stats program for my students. Now that I've posted the "reproducible code" it wants me to provide more data. Now, I didn't originally post this code because I thought it was a bit much and my general coding sucks.. My question still stand.. Why does one give me a short crappy, unhelpful output, and the other one provide exactly what I want...I ran the same command:
library(tidyverse)
DV <- c(1,1, 5,6, 1, 2, 7, 7, 1, 4, 9, 9)
IV1 <- c("A","B", "A", "B","A","B", "A", "B","A","B", "A", "B" )
IV2 <- c("C","C","D","D", "C","C", "D","D", "C","C", "D","D")
Participant <- c("A", "A", "A", "A", "B","B","B","B","C","C","C","C")
IV3 <- "no_data" #remove the word "no_data" and add c("A","B", "C", etc.. )
IV4 <- "no_data" #remove the word "no_data" and add c("A","B", "C", etc.. )
##### You have to tell the computer if the variable is within!
IV1_iswithin <-"Y"
IV2_iswithin <-"Y"
IV3_iswithin <-"N"
IV4_iswithin <-"N"
####### Your JOB is DONE
data <- data.frame(DV,Participant,IV1,IV2)
#Grouping the dataframe
data %>%group_by(IV2, IV1)%>% #subsetting the data set to calculate 4 different stats
mutate(MAD = median(abs(DV-median(DV))*2.5*1.4826))%>% #calculates 4 differnet mad numbers
mutate(MADLL = median(DV)-MAD)%>% #cacluates UL of MAD, pipe output to next command
mutate(MADUL = median(DV)+MAD)%>% #calculates the LL of MAD
mutate(OutlierPresent = ifelse(DV<MADLL | DV>MADUL, NA, DV))%>% #Creates NA values if it is an outlier
ungroup()%>% #converts back to big data set
mutate(OutlierPresent = ifelse(DV<MADLL | DV>MADUL, NA, DV))%>% #Creates NA
mutate(whichgroup <- paste(IV1,IV2))%>%
mutate(observation = 1:n()) %>%
{. ->> b }
b %>%
select(DV,OutlierPresent) %>%
{. ->> outlierfeedback }
formattable(outlierfeedback)
b %>%
select(OutlierPresent,IV1,IV2,Participant) %>%
pivot_wider(names_from = c(IV1,IV2), values_from = OutlierPresent, names_sep = "_", id_cols = Participant)%>%
drop_na()%>%
pivot_longer(-Participant, names_to = c("IV1","IV2"), names_sep = "_", values_to = "DV")%>%
{. ->> AnovaAnalysisData }
### Calculating the ANOVA for our outlier free data AnovaAnalysisData
#SettingUpTheListofFixedFactors
FactorModel <- list()
ifelse(length(IV1)> 1, FactorModel<- c(FactorModel, "IV1"), FactorModel<-FactorModel)
ifelse(length(IV2)> 1, FactorModel<- c(FactorModel, "IV2"), FactorModel<-FactorModel)
ifelse(length(IV3)> 1, FactorModel<- c(FactorModel, "IV3"), FactorModel<-FactorModel)
ifelse(length(IV4)> 1, FactorModel<- c(FactorModel, "IV4"), FactorModel<-FactorModel)
#SettingUPTheListofErrorFactors
ErrorModel <-list()
ErrorModel <- list()
ifelse(IV1_iswithin== "Y", ErrorModel<- c(ErrorModel, "IV1"), ErrorModel<-ErrorModel)
ifelse(IV2_iswithin== "Y", ErrorModel<- c(ErrorModel, "IV2"), ErrorModel<-ErrorModel)
ifelse(IV3_iswithin== "Y",ErrorModel<- c(ErrorModel, "IV3"), ErrorModel<-ErrorModel)
ifelse(IV4_iswithin== "Y", ErrorModel<- c(ErrorModel, "IV4"), ErrorModel<-ErrorModel)
StrStart = "aov(DV ~"
StrFactor<-paste(ErrorModel, collapse='*' )
StrErrorStart<-("+Error(Participant/(")
StrError<-paste(ErrorModel, collapse='*' )
StrErrorEnd<- ("),data=AnovaAnalysisData))")
StringforEvaluation<- paste(StrStart, StrFactor,StrErrorStart,StrError,StrErrorEnd)
B.aov2<- eval(parse(text=StringforEvaluation))
summary(B.aov2)
res.aov2 <- aov(DV ~ IV1*IV2+Error(Participant/(IV1*IV2), data = AnovaAnalysisData))
summary(res.aov2)

Resources