Here is the test I'm interested in:
http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h3.htm
How can I adapt this code into a function that accepts a vector of numeric values and returns a logical vector specifying which data points to remove?
I have attempted to do this below, but I'm getting stuck: once I sort the vector, the sorted values no longer line up with the positions in the input vector.
# input data
y = c(-0.25, 0.68, 0.94, 1.15, 1.20, 1.26, 1.26,
1.34, 1.38, 1.43, 1.49, 1.49, 1.55, 1.56,
1.58, 1.65, 1.69, 1.70, 1.76, 1.77, 1.81,
1.91, 1.94, 1.96, 1.99, 2.06, 2.09, 2.10,
2.14, 2.15, 2.23, 2.24, 2.26, 2.35, 2.37,
2.40, 2.47, 2.54, 2.62, 2.64, 2.90, 2.92,
2.92, 2.93, 3.21, 3.26, 3.30, 3.59, 3.68,
4.30, 4.64, 5.34, 5.42, 6.01)
## Generate normal probability plot.
qqnorm(y)
removeoutliers = function(dfinputcol) {
  y = as.vector(dfinputcol)
  ## Create function to compute the test statistic.
  rval = function(y){
    ares = abs(y - mean(y))/sd(y)
    df = data.frame(y, ares)
    r = max(df$ares)
    list(r, df)}
  ## Define values and vectors.
  n = length(y)
  alpha = 0.05
  lam = c(1:10)
  R = c(1:10)
  ## Compute test statistic until r=10 values have been
  ## removed from the sample.
  for (i in 1:10){
    if(i==1){
      rt = rval(y)
      R[i] = unlist(rt[1])
      df = data.frame(rt[2])
      newdf = df[df$ares!=max(df$ares),]}
    else if(i!=1){
      rt = rval(newdf$y)
      R[i] = unlist(rt[1])
      df = data.frame(rt[2])
      newdf = df[df$ares!=max(df$ares),]}
    ## Compute critical value.
    p = 1 - alpha/(2*(n-i+1))
    t = qt(p,(n-i-1))
    lam[i] = t*(n-i) / sqrt((n-i-1+t**2)*(n-i+1))
  }
  ## Print results.
  newdf = data.frame(c(1:10),R,lam)
  names(newdf)=c("Outliers","TestStat.", "CriticalVal.")
  # determine how many outliers to remove
  toremove = max(newdf$Outliers[newdf$TestStat. > newdf$CriticalVal.])
  # create vector of same size as input vector
  logicalvectorTifshouldremove = logical(length=length(y))
  # but how to determine which outliers to remove?
  # set largest data points as outliers to remove.. but could be the smallest in some data sets..
  logicalvectorTifshouldremove = replace(logicalvectorTifshouldremove, tail(sort(y), toremove), TRUE)
  return (logicalvectorTifshouldremove)
}
# this should have 3 data points set to TRUE .. but it has 2 and they aren't the correct ones
output = removeoutliers(y)
length(output[output==T])
The rule is effectively stated on the page you linked (not verbatim, but across two sentences):
Remove the r observations that maximize |x_i - mean(x)|
So you get the data without the outliers simply by dropping the r values with the largest absolute deviations, using:
y[abs(y-mean(y)) < sort(abs(y-mean(y)),decreasing=TRUE)[toremove]]
You do not need the last two lines of your code. By the way, you can compute the number of outliers directly, as the largest i whose test statistic exceeds its critical value:
toremove = max(which(newdf$TestStat. > newdf$CriticalVal.))
To simplify a bit, the final function could be:
# Compute the critical value for ESD Test
esd.critical <- function(alpha, n, i) {
  p = 1 - alpha/(2*(n-i+1))
  t = qt(p,(n-i-1))
  return(t*(n-i) / sqrt((n-i-1+t**2)*(n-i+1)))
}

removeoutliers = function(y) {
  ## Define values and vectors.
  y2 = y
  n = length(y)
  alpha = 0.05
  toremove = 0
  ## Compute test statistic until r=10 values have been
  ## removed from the sample.
  for (i in 1:10){
    if(sd(y2)==0) break
    ares = abs(y2 - mean(y2))/sd(y2)
    Ri = max(ares)
    y2 = y2[ares!=Ri]
    ## Compute critical value.
    if(Ri>esd.critical(alpha,n,i))
      toremove = i
  }
  # Values to keep
  if(toremove>0)
    y = y[abs(y-mean(y)) < sort(abs(y-mean(y)),decreasing=TRUE)[toremove]]
  return (y)
}
which returns:
> removeoutliers(y)
[1] -0.25 0.68 0.94 1.15 1.20 1.26 1.26 1.34 1.38 1.43 1.49
[12] 1.49 1.55 1.56 1.58 1.65 1.69 1.70 1.76 1.77 1.81 1.91
[23] 1.94 1.96 1.99 2.06 2.09 2.10 2.14 2.15 2.23 2.24 2.26
[34] 2.35 2.37 2.40 2.47 2.54 2.62 2.64 2.90 2.92 2.92 2.93
[45] 3.21 3.26 3.30 3.59 3.68 4.30 4.64
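If you need the logical vector asked for in the question (TRUE marking the points to remove) rather than the cleaned values, a minimal variant of the last step is sketched below (outlier_flags is a hypothetical name; it reuses the toremove count computed in the loop):
outlier_flags = function(y, toremove) {
  # TRUE for the toremove values with the largest absolute deviation from the mean;
  # positions stay aligned with the input vector (ties at the cutoff are all flagged)
  if (toremove == 0) return(logical(length(y)))
  dev = abs(y - mean(y))
  dev >= sort(dev, decreasing=TRUE)[toremove]
}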
You can use winsorize from the robustHD package:
library('robustHD')
set.seed(1234)
x <- rnorm(10)
x[1] <- x[1] * 10
x[2] <- x[2] * 11
x[10] <- x[10] * 10
x
[1] -12.0706575 3.0517217 1.0844412 -2.3456977 0.4291247 0.5060559 -0.5747400 -0.5466319 -0.5644520 -8.9003783
boxplot(x)
y <- winsorize(x)
y
[1] -4.5609058 3.0517217 1.0844412 -2.3456977 0.4291247 0.5060559 -0.5747400 -0.5466319 -0.5644520 -4.5609058
boxplot(y)
So if you have a data frame rather than a single vector, you can use sapply to apply the winsorize function to each column, as sketched below.
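For instance, a minimal sketch (dfnum is a hypothetical numeric data frame, not from the question):
library(robustHD)
dfnum <- data.frame(a = rnorm(20), b = rnorm(20))
dfnum$a[1] <- dfnum$a[1] * 10                      # inject an outlier
dfnum_w <- as.data.frame(sapply(dfnum, winsorize)) # winsorize each column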
For more information about this package, see http://cran.r-project.org/web/packages/robustHD/index.html
I am rather new to R and could really use the community's help with the following problem. I am trying to solve for the variable r in the following equation: ((EPS2 + r*DPS1 - EPS1)/r^2) - PRC. Here is my (unsuccessful) attempt at solving the problem (using the uniroot function):
EPS2 = df_final$EPS2
DPS1 = df_final$DPS1
EPS1 = df_final$EPS1
PRC = df_final$PRC
f1 = function(r) {
((df_final_test$EPS2 + r * df_final_test$DPS1-df_final_test$EPS1)/r^2)-df_final_test$PRC
}
uniroot(f1,interval = c(1e-8,100000),EPS2, DPS1, EPS1, PRC , extendInt="downX")$root
I then get the following error:
Error in f(lower, ...) : unused arguments (c(" 1.39", " 1.39", ...
I am grateful for any tips and hints you could give me regarding this problem, or whether a different function/package would be better suited in this case.
I've added a reprex below in case that helps:
df <- structure(list(EPS1 = c(6.53, 1.32, 1.39, 1.71, 2.13), DPS1 = c(2.53, 0.63,
0.81, 1.08, 1.33, 19.8), EPS2 = c(7.57,1.39,1.43,1.85,2.49), PRC = c(19.01,38.27,44.82,35.27,47.12)), .Names = c("EPS1", "DPS1", "EPS2", "PRC"), row.names = c(NA,
-5L), class = "data.frame")
I don't think you can use uniroot when the coefficients are vectors rather than scalars. In this case, a straightforward approach is to solve the equation analytically: setting ((EPS2 + r*DPS1 - EPS1)/r^2) - PRC = 0 and multiplying through by r^2 gives the quadratic PRC*r^2 - DPS1*r - (EPS2 - EPS1) = 0, whose roots by the quadratic formula are
r1 <- (DPS1 + sqrt(DPS1^2-4*PRC*(EPS1-EPS2)))/(2*PRC)
and
r2 <- (DPS1 - sqrt(DPS1^2-4*PRC*(EPS1-EPS2)))/(2*PRC)
where r1 and r2 are the two roots (with these data, r1 is the positive one).
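As a quick check (a sketch that assumes the stray sixth DPS1 value, 19.8, is dropped so that all columns have five entries), the positive root r1 can be computed vectorised; it should agree with the uniroot results further below, up to uniroot's default tolerance:
df <- data.frame(EPS1 = c(6.53, 1.32, 1.39, 1.71, 2.13),
                 DPS1 = c(2.53, 0.63, 0.81, 1.08, 1.33),
                 EPS2 = c(7.57, 1.39, 1.43, 1.85, 2.49),
                 PRC  = c(19.01, 38.27, 44.82, 35.27, 47.12))
# positive root of PRC*r^2 - DPS1*r - (EPS2 - EPS1) = 0
with(df, (DPS1 + sqrt(DPS1^2 - 4*PRC*(EPS1 - EPS2)))/(2*PRC))
# the first element is ~0.3097, matching uniroot's 0.30972906 below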
Disclaimer: I have no experience with uniroot() and no idea whether the following makes sense, but it runs! The idea is basically to call uniroot for each row of the data frame.
Note that I modified the function f1 slightly so that each of the additional parameters is passed as an argument to the function, rather than relying on objects of the same name being found in the parent environment. I also use with to avoid typing df$... for every variable.
library(tidyverse)
#> Warning: package 'ggplot2' was built under R version 4.1.0
library(furrr)
#> Loading required package: future
df <- structure(list(EPS1 = c(6.53, 1.32, 1.39, 1.71, 2.13),
DPS1 = c(2.53, 0.63, 0.81, 1.08, 1.33, 19.8),
EPS2 = c(7.57,1.39,1.43,1.85,2.49),
PRC = c(19.01,38.27,44.82,35.27,47.12)),
.Names = c("EPS1", "DPS1", "EPS2", "PRC"),
row.names = c(NA,-5L), class = "data.frame")
df
#> Warning in format.data.frame(if (omit) x[seq_len(n0), , drop = FALSE] else x, :
#> corrupt data frame: columns will be truncated or padded with NAs
#> EPS1 DPS1 EPS2 PRC
#> 1 6.53 2.53 7.57 19.01
#> 2 1.32 0.63 1.39 38.27
#> 3 1.39 0.81 1.43 44.82
#> 4 1.71 1.08 1.85 35.27
#> 5 2.13 1.33 2.49 47.12
f1 = function(r, EPS2, DPS1, EPS1, PRC) {
  ((EPS2 + r * DPS1 - EPS1)/r^2) - PRC
}
# try for first row
with(df,
     uniroot(f1,
             EPS2=EPS2[1], DPS1=DPS1[1], EPS1=EPS1[1], PRC=PRC[1],
             interval = c(1e-8,100000),
             extendInt="downX")$root)
#> [1] 0.3097291
# it runs!
# loop over each row
vec_sols <- rep(NA, nrow(df))
for (i in seq_len(nrow(df))) {
  sol <- with(df, uniroot(f1,
                          EPS2=EPS2[i], DPS1=DPS1[i], EPS1=EPS1[i], PRC=PRC[i],
                          interval = c(1e-8,100000),
                          extendInt="downX")$root)
  vec_sols[i] <- sol
}
vec_sols
#> [1] 0.30972906 0.05177443 0.04022946 0.08015686 0.10265226
# Alternatively, you can use furrr's future_map_dbl to use multiple cores.
# the following will basically do the same as the above loop.
# here with 4 cores.
plan(multisession, workers = 4)
vec_sols <- 1:nrow(df) %>% furrr::future_map_dbl(
  .f = ~with(df,
             uniroot(f1,
                     EPS2=EPS2[.x], DPS1=DPS1[.x], EPS1=EPS1[.x], PRC=PRC[.x],
                     interval = c(1e-8,100000),
                     extendInt="downX")$root
  ))
vec_sols
#> [1] 0.30972906 0.05177443 0.04022946 0.08015686 0.10265226
# then apply the solutions back to the dataframe (each row to each solution)
df %>% mutate(
root = vec_sols
)
#> Warning in format.data.frame(if (omit) x[seq_len(n0), , drop = FALSE] else x, :
#> corrupt data frame: columns will be truncated or padded with NAs
#> EPS1 DPS1 EPS2 PRC root
#> 1 6.53 2.53 7.57 19.01 0.30972906
#> 2 1.32 0.63 1.39 38.27 0.05177443
#> 3 1.39 0.81 1.43 44.82 0.04022946
#> 4 1.71 1.08 1.85 35.27 0.08015686
#> 5 2.13 1.33 2.49 47.12 0.10265226
Created on 2021-06-20 by the reprex package (v2.0.0)
I am trying to run a panel regression, where the dependent variable (stock returns for various companies) is regressed on 5 independent variables.
Here is a reproducible example of the data frame of independent variables:
dput(factors_1[1:10,])
structure(list(Date = 200002:200011, Mkt.RF = c(5.94, 0.66, -5.58,
-0.09, 0.67, -1.58, -1.61, -4.99, -2.71, -4.55), SMB = c(0.84,
-5.15, -4.62, 0.16, 0.33, -0.69, 0.68, 2.35, -6.1, -0.78), HML = c(-9.45,
3.33, 5.93, 6.17, 3.14, 3.31, -0.5, 2.64, 7.54, 11.15), RMW = c(3.55,
-2.59, -1.53, -3.38, -3.45, -0.12, -1.27, 1.63, 2.7, 0.79), CMA = c(-7.33,
4.96, 1.32, 4.94, 1.22, -0.12, 0.64, 2.16, 4.1, 8.75), RF = c(0.43,
0.47, 0.46, 0.5, 0.4, 0.48, 0.5, 0.51, 0.56, 0.51)), row.names = c(NA,
10L), class = "data.frame")
and here is one for the stock returns:
dput(xx[1:10, 1:10])
structure(list(Date = structure(c(10990.9954886386, 11019.9953776753,
11050.9954014418, 11080.9952984982, 11111.9953776753, 11141.9951640545,
11172.995061378, 11203.9951324494, 11233.9950455918, 11264.9949982497
), class = "Date"), X1 = c(0.0954887438827963,
-0.0596463008222008, 0.071350885788402, 0.0305926490738153, 0.0408331711459304,
-0.0211402933162625, -0.00493862203119688, 0.006182173191563,
0.0032423131269943, 0.0193884936176278), X2 = c(-0.123462974283698,
0.230503533400868, -0.0272942506612435, 0.0483790669291113, -0.0595278152717571,
0.12087834022411, -0.032011380068422, -0.0813892896957779, 0.0138779835292666,
0.0726322608057619), X3 = c(-0.0682052985267971, 0.172249290323711,
-0.154888201350603, 0.0395159403332963, -0.0143942598523314,
-0.0607566985291722, -0.0310708779173386, -0.0746345858888015,
-0.151109426840925, 0.0118888362760825), X4 = c(-0.114511361380472,
0.00998441685033158, 0.192522150537581, -0.0158023343537101,
0.0374730915541921, 0.0777493327863055, -0.0016218724457906,
-0.0635452365157563, 0.0565030106039299, 0.115759209185826),
X5 = c(0.00389199996406542, -0.0212889913893688,
0.164892967212694, -0.00832469019706505, -0.00462232472270219,
-0.0070177636719938, 0.00453659662769512, 0.0528941822866427,
-0.00836737746775751, -0.0050017502848112), X6 = c(-0.10351479457366,
0.0237125822002096, 0.0101860439504515, 0.0111924296807739,
-0.0652473862813747, 2.11404059631271e-05, 0.0261396151198399,
-0.0356789492292369, -0.0706069184275196, -0.0656535040135704
), X7 = c(-0.0980023956049211, 0.102552120231041,
-0.0959174074104425, -0.0790740833989735, 0.118610740782993,
-0.100050822390369, -0.00333557692764708, -0.0368703292701125,
0.0628135821343774, 0.0471186471744018), X9 = c(-0.0304322345046196,
-0.0977595796246631, 0.138258584646108, 0.0344876873979214,
-0.000721154371596811, 0.0508635363751093, 0.0533435865577603,
-0.0506646520146184, 0.0497235991059199, 0.0284083879640369
), X9 = c(-0.159712703662352, -0.0234902492510041, 0.116858931667507,
0.00432376896685471, 0.114340108193219, 0.00235829911414087,
-0.0573195744121493, 0.095634961997471, -0.0871461890063988,
-0.0738243041819919)), row.names = c(NA, 10L), class = "data.frame")
What I tried:
p1_q1_l<-plm(as.matrix(data.frame(xx[, -1]))~factors_1$Mkt.RF+factors_1$SMB+factors_1$HML+factors_1$RMW+factors_1$CMA,data=factors_1, method="within")
And what I got:
Error in tapply(x, effect, func, ...) : arguments must have same length
I don't understand what is going on. Both tables are data frames with the same number of observations. How can I fix this?
It is very likely that the error arises from the fact that you supply a matrix as your dependent (Y) variable, where a vector is needed. You need the data in long format, where Y is one column, and an ID column and a time column denote the different observations.
I have some doubts about the compatibility of your two data sets, but you may want to merge them into one. Just look carefully at how to merge your original data, particularly regarding the Date columns.
If I understand your xx data right, the X* columns are the different firms. Now convert the data to long format using reshape:
xxx <- reshape(xx, timevar=1, varying=2:9, direction="long", sep="")
xxx$Date <- as.character(xx$Date[xxx$Date])
Then it might be easier to merge the two data sets into one. The Date columns of the two data frames, however, don't match. If I understand your factors_1 data right, they are monthly values. I'll continue by simply attaching a day ("03") to the factors_1 dates to get them to match for now, but you know what's actually needed.
factors_1x <- transform(factors_1,
Date=as.character(as.Date(strptime(paste(factors_1$Date, 03),
"%Y%m%d"))))
Now merge:
dat <- merge(xxx, factors_1x, all.x=TRUE)
head(dat)
# Date X id Mkt.RF SMB HML RMW CMA RF
# 1 2000-02-03 0.09548874 1 5.94 0.84 -9.45 3.55 -7.33 0.43
# 2 2000-02-03 -0.05964630 2 5.94 0.84 -9.45 3.55 -7.33 0.43
# 3 2000-02-03 0.07135089 3 5.94 0.84 -9.45 3.55 -7.33 0.43
# 4 2000-02-03 0.03059265 4 5.94 0.84 -9.45 3.55 -7.33 0.43
# 5 2000-02-03 0.04083317 5 5.94 0.84 -9.45 3.55 -7.33 0.43
# 6 2000-02-03 -0.02114029 6 5.94 0.84 -9.45 3.55 -7.33 0.43
Now it is easier to write the formula. The new indices can be specified in the plm call using index=c("id", "Date") (note that plm's argument for the estimator type is model, not method):
library(plm)
p1_q1_l <- plm(X ~ Mkt.RF + SMB + HML + RMW + CMA + RF, model="within",
               index=c("id", "Date"), data=dat)
# Model Formula: X ~ Mkt.RF + SMB + HML + RMW + CMA + RF
#
# Coefficients:
# Mkt.RF SMB HML RMW CMA RF
# 0.0042267 0.0054278 -0.0016806 0.0129446 0.0148160 -0.4194726
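If you then want the usual regression table with standard errors and t-values (a standard follow-up, not shown above):
summary(p1_q1_l)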
I have a dataset in this format:
structure(list(id = 1:4, var1_before = c(-0.16, -0.31, -0.26,
-0.77), var2_before = c(-0.7, -1.06, -0.51, -0.81), var3_before = c(2.47,
2.97, 2.91, 3.01), var4_before = c(-1.08, -1.22, -0.92, -1.16
), var5_before = c(0.54, 0.4, 0.46, 0.79), var1_after = c(-0.43,
-0.18, -0.59, 0.64), var2_after = c(-0.69, -0.38, -1.19, -0.77
), var3_after = c(2.97, 3.15, 3.35, 1.52), var4_after = c(-1.11,
-0.99, -1.26, -0.39), var5_after = c(1.22, 0.41, 1.01, 0.24)), class = "data.frame", row.names = c(NA,
-4L))
Every id is unique.
I would like to make two clusters:
First cluster for variables: var1_before, var2_before, var3_before, var4_before, var5_before
Second cluster for variables: var1_after, var2_after, var3_after, var4_after, var5_after
I used the two-step cluster procedure in SPSS for this.
How can I do the same in R?
This question is quite complex. This is how I'd approach the problem, hoping to help and maybe to start a discussion about it.
Note:
this is how I think the problem could be approached;
I do not know two-step clustering, so I use a kmeans;
it's based on your data, but you can easily generalize it: I've made it dependent on your data because it's simpler to explain.
So, you create the first clustering with the before variables, then the values change (the after variables), and you want to see if the ids end up in the same cluster.
This suggests you only need the first set of clusters (on the before variables), then check whether the ids have moved: there is no need for a second clustering, only a check of whether each id has switched from one cluster to the other.
# first, you make your model of clustering, I'm using a simple kmeans
set.seed(1234)
model <- kmeans(df[,2:6],2)
# you put the clusters in the dataset
df$before_cluster <- model$cluster
Now the idea is to compute the Euclidean distance from each id's new values (the after variables) to the centroids computed on the before variables:
# for the first cluster
cl1 <- list()
for (i in 1:nrow(df)) {
  cl1[[i]] <- dist(rbind(df[i,7:11], model$centers[1,]))
}
cl1 <- do.call(rbind, cl1)
colnames(cl1) <- 'first'

# for the second cluster
cl2 <- list()
for (i in 1:nrow(df)) {
  cl2[[i]] <- dist(rbind(df[i,7:11], model$centers[2,]))
}
cl2 <- do.call(rbind, cl2)
colnames(cl2) <- 'second'

# put them together
df <- cbind(df, cl1, cl2)
Now the last part: you can decide whether an id has changed cluster by taking the smallest distance to the two centroids (the smallest distance determines the "new" cluster):
df$new_cl <- ifelse(df$first < df$second, 1,2)
df
  id var1_before var2_before var3_before var4_before var5_before var1_after var2_after var3_after var4_after var5_after before_cluster     first    second new_cl
1  1       -0.16       -0.70        2.47       -1.08        0.54      -0.43      -0.69       2.97      -1.11       1.22              2 0.6852372 0.8151840      1
2  2       -0.31       -1.06        2.97       -1.22        0.40      -0.18      -0.38       3.15      -0.99       0.41              1 0.7331098 0.5208887      2
3  3       -0.26       -0.51        2.91       -0.92        0.46      -0.59      -1.19       3.35      -1.26       1.01              2 0.6117598 1.1180004      1
4  4       -0.77       -0.81        3.01       -1.16        0.79       0.64      -0.77       1.52      -0.39       0.24              1 2.0848381 1.5994765      2
It seems they have all changed clusters.
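As a side note, a more compact way to get the same distances (a sketch equivalent to the two loops above):
# distance of each id's "after" profile to both before-centroids
dists <- t(apply(df[, 7:11], 1, function(row)
  apply(model$centers, 1, function(ctr) sqrt(sum((row - ctr)^2)))))
colnames(dists) <- c("first", "second")
df$new_cl <- max.col(-dists)  # column index of the smallest distance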
I am working on analysing some data with a wage variable. The variable contains the '€' symbol plus an 'M' or 'K' suffix.
I was trying to use the gsub() function to address this issue, yet my code doesn't work:
Integer_converter <- function(strWage) {
  Factor_Wage = gsub("€", " ", strWage)
}

Factor_converter_1 <- function(strWage) {
  Integer_Wage = gsub("M", " ", strWage)
}

Factor_converter_2 <- function(strWage) {
  Integer_wage = as.integer(as.integer(gsub("K", "", strWage)) / 100)
}
The actual values are listed as follows:
$ Wage /fct/ €405K, €195K, €205K, €240K, €175K, €25K, €205K, €57K, €140K, €135K, €15K, €45K, €40K, €76K, €17K, €125K, …
and I want to convert it into
$ Wage /int/ 0.405, 0.195, 0.205, 0.240, 0.175, 0.025, 0.205, 0.057, 0.140, 0.135, 0.015, 0.045, 0.040, 0.076, 0.017, 0.125, …
We can use parse_number from readr to extract the number and divide by 1000.
library(readr)
parse_number(as.character(df1$Wage))/1000
#[1] 0.405 0.195 0.205 0.240 0.175 0.025 0.205 0.057 0.140
#[10] 0.135 0.015 0.045 0.040 0.076 0.017 0.125
It extracts the numeric part and then just divides by 1000.
It can also be done with a tidyverse chain:
library(dplyr)
df1 %>%
mutate(Wage = parse_number(as.character(Wage))/1000)
If there are "M" values in addition to "K", we can use gsubfn:
library(gsubfn)
unname(sapply(gsubfn("[A-Z]", list(K = '/1e3', M = '/1e6'),
sub("€", "", df2$Wage)), function(x) eval(parse(text = x))))
data
df1 <- data.frame(Wage = c("€405K", "€195K", "€205K", "€240K", "€175K",
"€25K", "€205K", "€57K", "€140K", "€135K", "€15K", "€45K",
"€40K", "€76K", "€17K", "€125K"))
df2 <- data.frame(Wage = c("€405K", "€195K", "€205K", "€240K", "€175K",
"€25K", "€205K", "€57K", "€140K", "€135K", "€15M", "€45K",
"€40K", "€76K", "€17M", "€125K"))
Following the findCorrelation() documentation, I ran the official example as shown below:
Code:
library(caret)
R1 <- structure(c(1, 0.86, 0.56, 0.32, 0.85, 0.86, 1, 0.01, 0.74, 0.32,
0.56, 0.01, 1, 0.65, 0.91, 0.32, 0.74, 0.65, 1, 0.36,
0.85, 0.32, 0.91, 0.36, 1),
.Dim = c(5L, 5L))
colnames(R1) <- rownames(R1) <- paste0("x", 1:ncol(R1))
findCorrelation(R1, cutoff = .6, exact = TRUE, names = TRUE,
                verbose = TRUE)
Result:
> findCorrelation(R1, cutoff = .6, exact = TRUE, names = TRUE, verbose = TRUE)
## Compare row 1 and column 5 with corr 0.85
## Means: 0.648 vs 0.545 so flagging column 1
## Compare row 5 and column 3 with corr 0.91
## Means: 0.53 vs 0.49 so flagging column 5
## Compare row 3 and column 4 with corr 0.65
## Means: 0.33 vs 0.352 so flagging column 4
## All correlations <= 0.6
## [1] "x1" "x5" "x4"
I have no idea how the computation works, i.e., why row 1 and column 5 are compared first, and how the means are calculated, even after reading the source file.
I hope someone can explain the algorithm using my example.
First, it determines the average absolute correlation for each variable. Columns x1 and x5 have the highest averages (mean(c(0.86, 0.56, 0.32, 0.85)) and mean(c(0.85, 0.32, 0.91, 0.36)) respectively), so it looks to remove one of these on the first step. It finds x1 to be the most globally offensive, so it removes it.
After that, it recomputes and compares x5 and x3 using the same process.
It stops after removing three columns, since all remaining pairwise correlations are below your threshold.
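To see the first step concretely, here is a small sketch computing each column's average absolute correlation on R1 (diagonal excluded):
sapply(seq_len(ncol(R1)), function(j) mean(abs(R1[-j, j])))
# [1] 0.6475 0.4825 0.5325 0.5175 0.6100
x1 (0.6475) and x5 (0.61) have the two largest averages, which is why that pair is examined first in the verbose output.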