How to detect and remove outliers within a loop in R?

Hope someone can send help to a desperate student :-)
I have a set of procedure codes, each with a different number of surgeries (here: procedures) and their respective durations. I would like to get some descriptive statistics on the durations. For that, I would like my loop to detect and remove the outliers using the IQR method before the statistics are computed. This is the code without outlier detection and removal:
# variables for output - run before each loop
Counter0<-1
Procedure_codes<-NULL
Number<-NULL
Min_Times<-NULL
Max_Times<-NULL
Average_Times<-NULL
Median_Times<-NULL
SD_Times<-NULL
#loop over all procedure codes
while (Counter0 <= number_of_different_procedurecodes) {
  a_g_procedures2 <- NULL
  Procedure_Name <- eval(list_of_procedurecodes[Counter0])
  Procedure_name <- unlist(Procedure_Name)
  print(Procedure_Name)
  a_g_procedures2$Duration <- NULL
  Durations <- NULL
  number_of_procedures <- 0
  #Subset data for the specific procedure
  a_g_procedures2 <- subset(a_g_procedures1, ProcedureCode == Procedure_Name)
  number_of_procedures <- length(a_g_procedures2$ProcedureCode)
  Counter1 <- 1
  #loop over specific procedure
  while (Counter1 <= number_of_procedures) {
    a_g_procedures$Duration <- NULL
    TimeIn_1_Selected <- a_g_procedures2$"TimeIn_1"[Counter1]
    TimeIn_1_Selected <- as.POSIXct(TimeIn_1_Selected, format = "%d/%m/%Y %H:%M")
    TimeIn_1_S <- as.numeric(TimeIn_1_Selected)
    TimeIn_2_Selected <- a_g_procedures2$"TimeIn_2"[Counter1]
    TimeIn_2_Selected <- as.POSIXct(TimeIn_2_Selected, format = "%d/%m/%Y %H:%M")
    TimeIn_2_S <- as.numeric(TimeIn_2_Selected)
    TimeOut_Selected <- a_g_procedures2$"TimeOut"[Counter1]
    TimeOut_Selected <- as.POSIXct(TimeOut_Selected, format = "%d/%m/%Y %H:%M")
    if (TimeIn_1_S > TimeIn_2_S) {
      Start_Time <- TimeIn_2_Selected
    }
    if (TimeIn_1_S <= TimeIn_2_S) {
      Start_Time <- TimeIn_1_Selected
    }
    print(Start_Time)
    print(TimeOut_Selected)
    Duration <- difftime(TimeOut_Selected, Start_Time, units = "mins")
    Durations <- c(Durations, Duration)
    Counter1 <- Counter1 + 1
  }
  Procedure_codes <- c(Procedure_codes, Procedure_name)
  Durations <- as.numeric(Durations)
  Mean_Time <- mean(Durations, digits = 1)
  SD_Time <- sd(Durations, na.rm = TRUE)
  Min_Time <- min(Durations, na.rm = TRUE)
  Max_Time <- max(Durations, na.rm = TRUE)
  Median_Time <- median(Durations, na.rm = TRUE)
  Average_Times <- c(Average_Times, Mean_Time)
  SD_Times <- c(SD_Times, SD_Time)
  Min_Times <- c(Min_Times, Min_Time)
  Max_Times <- c(Max_Times, Max_Time)
  Median_Times <- c(Median_Times, Median_Time)
  Number <- c(Number, number_of_procedures)
  Counter0 <- Counter0 + 1
}
ag_output<-data.frame(Procedure_codes,Number,Min_Times, Max_Times, Average_Times, Median_Times, SD_Times)
This is what I would have liked to add to the loop over specific procedure:
Q<-quantile(Duration, probs=c(.25,.75), na.rm=FALSE)
iqr<-IQR(Duration)
up<-Q[2]+1.5*iqr
low<-Q[1]-1.5*iqr
remove<-Duration>(Q[1]-1.5*iqr) & Durations<(Q[1]-1.5*iqr)
setdiff(Duration, remove)
Does somebody have an idea how I could do this?
Thank you very much in advance!

make it a function?
f.remove_outliers_IQR <- function(Duration)
{
  Q <- quantile(Duration, probs = c(.25, .75), na.rm = TRUE)
  iqr <- IQR(Duration, na.rm = TRUE)
  up <- Q[2] + 1.5*iqr
  low <- Q[1] - 1.5*iqr
  # keep the values inside the fences; logical subsetting instead of setdiff,
  # and compare against both bounds (your draft compared twice against the lower one)
  Duration_out <- Duration[Duration >= low & Duration <= up]
  return(Duration_out)
}
and call it in the main loop on the full Durations vector, right after the inner while loop finishes? It needs all the durations for a procedure code at once, so just before the summary statistics rather than before Counter1<-Counter1+1.
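Roughly like this, as a sketch against your existing variable names (untested on your data):
Durations <- as.numeric(Durations)
Durations <- f.remove_outliers_IQR(Durations)
number_of_procedures <- length(Durations)  # so Number reports the kept cases
Mean_Time <- mean(Durations, na.rm = TRUE)
# ...then Min_Time, Max_Time, Median_Time and SD_Time on the filtered vector as before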

Related

Creating a function to loop columns through an equation in R

Solution (thanks @Peter_Evan!) in case anyone coming across this question has a similar issue
(Original question is below)
## get all slopes (lm coefficients) first
# list of subfields of interest to loop through
sf <- c("left_presubiculum", "right_presubiculum",
"left_subiculum", "right_subiculum", "left_CA1", "right_CA1",
"left_CA3", "right_CA3", "left_CA4", "right_CA4", "left_GC-ML-DG",
"right_GC-ML-DG")
# dependent variables are sf, independent variable common to all models in the inner lm() call is ICV
# applies the lm(subfield ~ ICV, dataset = DF) to all subfields of interest (sf) specified previously
lm.results <- lapply(sf, function(dv) {
  temp.lm <- lm(get(dv) ~ ICV, data = DF)
  coef(temp.lm)
})
# returns a list, where each element is a vector of coefficients
# do.call(rbind, ) will paste them together
lm.coef <- data.frame(sf = sf,
do.call(rbind, lm.results))
# tidy up name of intercept variable
names(lm.coef)[2] <- "intercept"
lm.coef
## set up all components for the equation
# matrix to store output
out <- matrix(ncol = length(sf), nrow = NROW(DF))
# name the rows after each subject
row.names(out) <- DF$Subject
# name the columns after each subfield
colnames(out) <- sf
# nested for loop that goes by subject (j) and subfield (i)
for (j in DF$Subject) {
  for (i in sf) {
    slope <- lm.coef[lm.coef$sf == i, "ICV"]
    out[j, i] <- as.numeric(DF[DF$Subject == j, i] - (slope * (DF[DF$Subject == j, "ICV"] - mean(DF$ICV))))
  }
}
# check output
out
===============
Original Question:
I have a dataframe (DF) with 13 columns (12 different brain subfields, and one column containing total intracranial volume (ICV)) and 50 rows (each a different participant). I'm trying to automate an equation that is looped over every column for each participant.
The data:
structure(list(Subject = c("sub01", "sub02", "sub03", "sub04",
"sub05", "sub06", "sub07", "sub08", "sub09", "sub10", "sub11",
"sub12", "sub13", "sub14", "sub15", "sub16", "sub17", "sub18",
"sub19", "sub20"), ICV = c(1.50813, 1.3964237, 1.6703585, 1.4641886,
1.6351018, 1.5524641, 1.4445532, 1.6384505, 1.6152434, 1.5278011,
1.4788126, 1.4373356, 1.4109637, 1.3634952, 1.3853583, 1.4855268,
1.6082085, 1.5644998, 1.5617522, 1.4304141), left_subiculum = c(411.225013,
456.168033, 492.968477, 466.030173, 533.95505, 476.465524, 448.278213,
476.45566, 422.617374, 498.995121, 450.773906, 461.989663, 549.805272,
452.619547, 457.545623, 451.988333, 475.885847, 490.127968, 470.686415,
494.06548), left_CA1 = c(666.893596, 700.982955, 646.21927, 580.864234,
721.170599, 737.413139, 737.683665, 597.392434, 594.343911, 712.781376,
733.157168, 699.820162, 701.640861, 690.942843, 606.259484, 731.198846,
567.70879, 648.887718, 726.219904, 712.367433), left_presubiculum = c(325.779458,
391.252815, 352.765098, 342.67797, 390.885737, 312.857458, 326.916867,
350.657957, 325.152464, 320.718835, 273.406949, 305.623938, 371.079722,
315.058313, 311.376271, 319.56678, 348.343569, 349.102678, 322.39908,
306.966008), `left_GC-ML-DG` = c(327.037756, 305.63224, 328.945065,
238.920358, 319.494513, 305.153183, 311.347404, 259.259723, 295.369164,
312.022281, 324.200989, 314.636501, 306.550385, 311.399107, 295.108592,
356.197094, 251.098248, 294.76349, 317.308576, 301.800253), left_CA3 = c(275.17038,
220.862237, 232.542718, 170.088695, 234.707172, 210.803287, 246.861975,
171.90896, 220.83478, 236.600832, 246.842024, 239.677362, 186.599097,
224.362411, 229.9142, 293.684776, 172.179779, 202.18936, 232.5666,
221.896625), left_CA4 = c(277.614028, 264.575987, 286.605092,
206.378619, 281.781858, 258.517989, 269.354864, 226.269982, 256.384436,
271.393257, 277.928824, 265.051581, 262.307377, 266.924683, 263.038686,
306.133918, 226.364556, 262.42823, 264.862956, 255.673948), right_subiculum = c(468.762375,
445.35738, 446.536018, 456.73484, 521.041823, 482.768261, 487.2911,
456.39996, 445.392976, 476.146498, 451.775611, 432.740085, 518.170065,
487.642399, 405.564237, 487.188989, 467.854363, 479.268714, 473.212833,
472.325916), right_CA1 = c(712.973011, 717.815214, 663.637105,
649.614586, 711.844375, 779.212704, 862.784416, 648.925038, 648.180611,
760.761704, 805.943016, 717.486756, 801.853608, 722.213109, 621.676321,
791.672796, 605.35667, 637.981476, 719.805053, 722.348921), right_presubiculum = c(327.285242,
364.937865, 288.322641, 348.30058, 341.309111, 279.429847, 333.096795,
342.184296, 364.245998, 350.707173, 280.389853, 276.423658, 339.439377,
321.534798, 302.164685, 328.365751, 341.660085, 305.366589, 320.04127,
303.83284), `right_GC-ML-DG` = c(362.391907, 316.853532, 342.93274,
282.550769, 339.792696, 357.867386, 342.512721, 277.797528, 309.585721,
343.770416, 333.524912, 302.505077, 309.063135, 291.29361, 302.510461,
378.682679, 255.061044, 302.545288, 313.93902, 297.167161), right_CA3 = c(307.007404,
243.839349, 269.063801, 211.336979, 249.283479, 276.092623, 268.183349,
202.947849, 214.642782, 247.844657, 291.206598, 235.864996, 222.285729,
201.427853, 237.654913, 321.338801, 199.035108, 243.204203, 236.305659,
213.386702), right_CA4 = c(312.164065, 272.905586, 297.99392,
240.765062, 289.98697, 306.459566, 284.533068, 245.965817, 264.750571,
296.149675, 290.66935, 264.821461, 264.920869, 246.267976, 266.07378,
314.205819, 229.738951, 274.152503, 256.414608, 249.162404)), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
The equation:
adjustedBrain(participant1) = rawBrain(participant1) - slope*[ICV(participant1) - (mean of all ICV measures included in the calculation of the slope)]
The code (which is not working and I was hoping for some pointers):
adjusted_Brain <- function(DF, subject) {
subfields <- colnames(select(DF, "left_presubiculum", "right_presubiculum",
"left_subiculum", "right_subiculum", "left_CA1", "right_CA1",
"left_CA3", "right_CA3", "left_CA4", "right_CA4", "left_GC-ML-DG",
"right_GC-ML-DG"))
out <- matrix(ncol = length(subfields), nrow = NROW(DF))
for (i in seq_along(subfields)) {
DF[i] = DF[DF$Subject == "subject", "i"] -
slope * (DF[DF$Subject == "subject", "ICV"] -
mean(DF$ICV))
}
}
Getting this error:
Error: Can't subset columns that don't exist.
x Column `i` doesn't exist.
A few notes:
The slopes for each subject for each subfield will be different (and will come from a regression) -> is there a way to specify that in the function so the slope (coefficient from the appropriate regression equation) gets called in?
I have my nrow set to the number of participants right now in the output because I'd like to have this run through EVERY subject across EVERY subfield and spit out a matrix with all the adjusted brain volumes... But that seems very complicated and so for now I will just settle for running each participant separately.
Any help is greatly appreciated!
As others have noted in the comments, there are quite a few syntax issues that prevent your code from running, as well as a few unstated requirements. That aside, I think there is enough to recommend a few improvements that you can hopefully build on. Here are the top line changes:
You likely don't need this to be a function, but rather a nested for loop (if you want to do this with base R). As written, the code isn't flexible enough to merit a function. If you intend to apply this many times across different datasets, a function might make sense. However, it will require a much larger rewrite.
Assuming you are fitting a simple regression via lm, then you can pull out the coefficient of interest via the $ operator and indexing (see below). Some thought will need to go into how to handle different models in the loop. Here, we assume you only need one coefficient from one model.
There are a few areas where the syntax is incorrect and a review of subsetting in base R would be helpful. Others have pointed out in the comments where some of these are (for example, DF[DF$Subject == "subject", "i"] looks up the literal strings "subject" and "i" rather than the values of your loop variables, which is exactly the error you are seeing).
Here is one approach where we loop through each subject (j) and each feature or subfield (i) and store the results in a matrix (out). This is just an approach and will almost certainly need tweaking on your end!
#NOTE: the dataset you provided is saved as x in this example.
# fit a linear model - here we assume there is only one coef. of interest, but you may need to alter
# depending on how the slope changes in each calculation
reg <- lm(right_CA3 ~ ICV, data = x)
# view the coefficients
reg$coefficients
# pull out the slope from the reg object - it is the second coefficient (the first is the intercept)
slope <- reg$coefficients[[2]]
# list of features/subfields to loop through
sf <- c("left_presubiculum", "right_presubiculum",
"left_subiculum", "right_subiculum", "left_CA1", "right_CA1",
"left_CA3", "right_CA3", "left_CA4", "right_CA4", "left_GC-ML-DG",
"right_GC-ML-DG")
# matrix to store output
out <- matrix(ncol = length(sf), nrow = NROW(x))
#name the rows after each subject
row.names(out) <- x$Subject
#name the columns after each subfield
colnames(out) <- sf
# nested for loop that goes by subject (j) and features/subfields (i)
for (j in x$Subject) {
  for (i in sf) {
    out[j, i] <- as.numeric( x[x$Subject == j, i] - (slope * (x[x$Subject == j, "ICV"] - mean(x$ICV))) )
  }
}
# check output
out

Subsetting a data set and plotting means

I have a data set including Year, Site, and Species Count. I am trying to write code that accounts for the fact that in some years the counts were done twice. For those years I have to find the mean count at each site for each species (there are two different species), and plot those means. This is the code I have generated:
DataSet1 <- subset(channel_islands,
channel_islands$SpeciesName=="Hypsypops ubicundus, adult" |
channel_islands$SpeciesName=="Paralabrax clathratus,adult")
years<-unique(DataSet1$Year)
Hypsypops_mean <- NULL
Paralabrax_mean <- NULL
Mean <- NULL
years <- unique(DataSet1$Year)
for (i in 1:length(years)) {
  data_year <- DataSet1[which(DataSet1$Year == years[i]), ]
  Hypsypops <- data_year[which(data_year$SpeciesName == "Hypsypops rubicundus,adult"), ]
  Paralabrax <- data_year[which(data_year$SpeciesName == "Paralabrax clathratus,adult"), ]
  UNIQUESITE <- unique(unique(data_year$Site))
  for (m in 1:(length(UNIQUESITE))) {
    zz <- Hypsypops[Hypsypops$Site == m, ]
    if (length(zz$Site) >= 2) {
      Meanp <- mean(Hypsypops$Count[Hypsypops$Site == UNIQUESITE[m]])
      Hypsypops_mean <- rbind(Hypsypops_mean,
                              c(UNIQUESITE[m], years[i], round(Meanp, 2),
                                'Hypsypops rubicundus,adult'))
    }
    kk <- Paralabrax[Paralabrax$Site == m, ]
    if (length(kk$Site) >= 2) {
      Meane <- mean(Paralabrax$Count[Paralabrax$Site == UNIQUESITE[m]])
      Paralabrax_mean <- rbind(Paralabrax_mean,
                               c(UNIQUESITE[m], years[i], round(Meane, 2),
                                 'Paralabrax clathratus,adult'))
    }
  }
  if (i == 1) {
    Mean <- rbind(Hypsypops_mean, Paralabrax_mean)
  }
  if (i > 1) {
    Mean <- rbind(DataMean, Hypsypops_mean, Paralabrax_mean)
  }
  Hypsypops_mean <- NULL
  Paralabrax_mean <- NULL
}
Mean <- as.data.frame(Mean,stringsAsFactors=F)
names(Mean) <- c('Site','Year','mean_count','SpeciesName')
Mean$Site <- as.integer(Mean$Site)
Mean$Year <- as.integer(Mean$Year)
Mean$mean_count <- as.numeric(Mean$mean_count)
par(mfrow=c(5,5), oma=c(4,2,4,2), mar=c(5.5,4,3,0))
for (i in 1:length(years)) {
  if (any(Mean$Year == years[i])) {
    year1 <- Mean[which(Mean$Year == years[i]), ]
    Species <- unique(as.character(year1$SpeciesName))
    Colors <- c("pink", "purple")[Species]
    Data_Hr <- year1[year1$SpeciesName == "Hypsypops rubicundus,adult", ]
    Data_Pc <- year1[year1$SpeciesName == "Paralabrax clathratus,adult", ]
    plot(Data_Hr$mean_count ~ Data_Pc$mean_count,
         xlab = c("Hypsypops rubicundus"),
         ylab = c("Paralabrax clathratus"), main = years[i], pch = 16)
  }
}
It's a lot, I'm sorry; I'm not sure of a way to streamline the process. But I keep getting an error:
Error in names(Mean) <- c("Site", "Year", "mean_count", "SpeciesName")
: 'names' attribute [4] must be the same length as the vector [0]
Not sure how I can debug this.
Not sure why you want to do this with such an elaborate loop. It sounds like you are trying to summarise your data.
This can be done in different ways. Here is a solution using dplyr:
DataSet1 %>%
  group_by(Year, SpeciesName, Site) %>%
  summarise(nrecords = n(),
            Count = mean(Count))
To get a better answer, it might be helpful to post a subset of the data and the intended result you are after.
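If you also want to plot those means, here is a rough sketch building on the same idea (assuming ggplot2 is available, column names as given in your question, and storing the summarised data as mean_counts):
library(dplyr)
library(ggplot2)
mean_counts <- DataSet1 %>%
  group_by(Year, SpeciesName, Site) %>%
  summarise(nrecords = n(),
            Count = mean(Count))
# one panel per site: mean count over the years, coloured by species
ggplot(mean_counts, aes(x = Year, y = Count, colour = SpeciesName)) +
  geom_point() +
  geom_line() +
  facet_wrap(~ Site)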

How to create a loop that changes part of a column name in a data frame

I am trying to find Cronbach's Alpha for survey data containing a series of multi-item measures. Rather than have to manually write out every single multi-item measure, it looks like something a loop should be able to manage far more effectively, but it needs to change only part of the column name, according to the question number.
The basic idea as it currently sits in my head would be...
for (N in 4:22) {
ytqN <- data.frame(YT_Data$QNa, YT_Data$QNb, YT_Data$QNc)
alpha(ytqN)
}
The loop would then create new data frames for each multi item measure and run Cronbach's Alpha as it goes.
This doesn't work though. :(
ytq4 <- data.frame(YT_Data$Q4a, YT_Data$Q4b, YT_Data$Q4c)
alpha(ytq4)
ytq5 <- data.frame(YT_Data$Q5a, YT_Data$Q5b, YT_Data$Q5c)
alpha(ytq5)
ytq6 <- data.frame(YT_Data$Q6a, YT_Data$Q6b, YT_Data$Q6c)
alpha(ytq6)
ytq7 <- data.frame(YT_Data$Q7a, YT_Data$Q7b, YT_Data$Q7c)
alpha(ytq7)
ytq8 <- data.frame(YT_Data$Q8a, YT_Data$Q8b, YT_Data$Q8c)
alpha(ytq8)
ytq9 <- data.frame(YT_Data$Q9a, YT_Data$Q9b, YT_Data$Q9c)
alpha(ytq9)
ytq10 <- data.frame(YT_Data$Q10a, YT_Data$Q10b, YT_Data$Q10c)
alpha(ytq10)
ytq11 <- data.frame(YT_Data$Q11a, YT_Data$Q11b, YT_Data$Q11c)
alpha(ytq11)
ytq12 <- data.frame(YT_Data$Q12a, YT_Data$Q12b, YT_Data$Q12c)
alpha(ytq12)
ytq13 <- data.frame(YT_Data$Q13a, YT_Data$Q13b, YT_Data$Q13c)
alpha(ytq13)
ytq14 <- data.frame(YT_Data$Q14a, YT_Data$Q14b, YT_Data$Q14c)
alpha(ytq14)
ytq15 <- data.frame(YT_Data$Q15a, YT_Data$Q15b, YT_Data$Q15c)
alpha(ytq15)
ytq16 <- data.frame(YT_Data$Q16a, YT_Data$Q16b, YT_Data$Q16c)
alpha(ytq16)
ytq17 <- data.frame(YT_Data$Q17a, YT_Data$Q17b, YT_Data$Q17c)
alpha(ytq17)
ytq18 <- data.frame(YT_Data$Q18a, YT_Data$Q18b, YT_Data$Q18c)
alpha(ytq18)
ytq19 <- data.frame(8 - YT_Data$Q19a, YT_Data$Q19b, YT_Data$Q19c)
# Reverse code Q19a
alpha(ytq19)
ytq20 <- data.frame(YT_Data$Q20a, YT_Data$Q20b, YT_Data$Q20c)
alpha(ytq20)
ytq21 <- data.frame(YT_Data$Q21a, YT_Data$Q21b, YT_Data$Q21c)
alpha(ytq21)
ytq22 <- data.frame(YT_Data$Q22a, YT_Data$Q22b, YT_Data$Q22c)
alpha(ytq22)
The desired result would be a single output containing all the Cronbach's Alphas for the multi-item measures for questions 4-22 in the data set I am currently working on, produced by a single piece of code rather than having to go question by question.
It's easier to help if you include your data, but I guess this should work:
alpha_list = list()
for (N in 4:22) {
  ytq = data.frame(YT_Data[paste0("Q", N, "a")],
                   YT_Data[paste0("Q", N, "b")],
                   YT_Data[paste0("Q", N, "c")])
  alpha_list[[N]] = alpha(ytq)
}
We are using paste0() to create the column names while looping over N. alpha_list will be a list with the results given by alpha().
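If you then want those 19 alphas in one output rather than a list of full alpha() results, you could pull the estimate out of each element afterwards. A sketch, assuming alpha() here is psych::alpha(), which stores its estimates in the $total element of the returned object:
# collect the raw alpha for questions 4-22 into one data frame
alpha_summary <- data.frame(
  question  = 4:22,
  raw_alpha = sapply(alpha_list[4:22], function(a) a$total$raw_alpha)
)
alpha_summary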

SMA using R & TTR Package

Afternoon! I'm just starting out with R and learning about data frames, packages, etc. I've read a lot of the messages here but couldn't find an answer.
I have a table I'm accessing with R that has the following fields:
[Symbol],[Date],[Open],[High],[Low],[Close],[Volume]
And, I'm calculating SMAs on the close prices:
sqlQuery <- "Select * from [dbo].[Stock_Data]"
conn <- odbcDriverConnect(connectionString)
dfSMA <- sqlQuery(conn, sqlQuery)
sma20 <- SMA(dfSMA$Close, n = 20)
dfSMA["SMA20"] <- sma20
When I look at the output, it appears to be calculating the SMA without any regard for what the symbol is. I haven't tried to replicate the calculation, but I would suspect it's just doing it by 20 moving rows, regardless of date/symbol.
How do I restrict the calculation to a given symbol?
Any help is appreciated - just need to be pointed in the right direction.
Thanks
You're far more likely to get answers if you provide reproducible examples. First, let's replicate your data:
library(quantmod)
library(stringr)   # for str_replace() below
symbols <- c("GS", "MS")
getSymbols(symbols)
# Create example data:
dGS <- data.frame("Symbol" = "GS", "Date" = index(GS), coredata(OHLCV(GS)))
names(dGS) <- str_replace(names(dGS), "GS\\.", "")
dMS <- data.frame("Symbol" = "MS", "Date" = index(MS), coredata(OHLCV(MS)))
names(dMS) <- str_replace(names(dMS), "MS\\.", "")
dfSMA <- rbind(dGS, dMS)
> head(dfSMA)
Symbol Date Open High Low Close Volume Adjusted
1 GS 2007-01-03 200.60 203.32 197.82 200.72 6494900 178.6391
2 GS 2007-01-04 200.22 200.67 198.07 198.85 6460200 176.9748
3 GS 2007-01-05 198.43 200.00 197.90 199.05 5892900 177.1528
4 GS 2007-01-08 199.05 203.95 198.10 203.73 7851000 181.3180
5 GS 2007-01-09 203.54 204.90 202.00 204.08 7147100 181.6295
6 GS 2007-01-10 203.40 208.44 201.50 208.11 8025700 185.2161
What you want to do is subset your long data object, and then apply the technical indicators to each symbol in isolation. Here is one approach to guide you toward achieving your desired result.
You could do this using a list, and build the indicators on xts objects for each symbol rather than on a data.frame as in your example (you can apply the TTR functions to columns of a data.frame, but it is ugly; working with xts objects is much more natural). This is a template for how you could do it. The final output l.data should be intuitive to work with. Keep each symbol in a separate "container" (element of the list) rather than combining all the symbols in one data.frame, which isn't easy to work with.
make_xts_from_long_df <- function(x) {
  # Subset the symbol you desire
  res <- dfSMA[dfSMA$Symbol == x, ]
  # Create xts, then allow easy merge of technical indicators
  x_res <- xts(OHLCV(res), order.by = res$Date)
  merge(x_res, SMA(Cl(x_res), n = 20))
}
l.data <- setNames(lapply(symbols, make_xts_from_long_df), symbols)
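The merged SMA column keeps the default name SMA from TTR, so you can inspect or extract it per symbol, for example (a quick sketch):
# last rows of the merged OHLCV + SMA series for one symbol
tail(l.data[["GS"]])
# or pull the 20-day SMA out of every symbol at once
lapply(l.data, function(x) x[, "SMA"])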

linking crsp and compustat in R via WRDS

I am using R to connect to WRDS. Now, I would like to link Compustat and CRSP tables. In SAS, this would be achieved using macros and the CCM link table. What would be the best way to approach this topic in R?
PROGRESS UPDATE:
I downloaded the CRSP, Compustat and CCM link tables from WRDS.
sql <- "select * from CRSP.CCMXPF_LINKTABLE"
res <- dbSendQuery(wrds, sql)
ccmxpf_linktable <- fetch(res, n = -1)
ccm.dt <- data.table(ccmxpf_linktable)
rm(ccmxpf_linktable)
I am then converting the suggested matching routine from the WRDS event study SAS file into R:
ccm.dt[,typeflag:=linktype %in% c("LU","LC","LD","LN","LS","LX") & USEDFLAG=="1"]
setkey(ccm.dt, gvkey, typeflag)
for (i in 1:nrow(compu.dt)) {
  gvkey.comp = compu.dt[i, gvkey]
  endfyr.comp = compu.dt[i, endfyr]
  PERMNO.val <- ccm.dt[.(gvkey.comp, TRUE), ][linkdt <= endfyr.comp & endfyr.comp <= linkenddt, lpermno]
  if (length(PERMNO.val) == 0) PERMNO.val <- NA
  suppressWarnings(compu.dt[i, "PERMNO"] <- PERMNO.val)
}
However, this code is fantastically inefficient. I started out with data.table, but do not really understand how to apply its logic inside the for loop. I am hoping that someone could point me to a way to improve it.
Matching fields in stages works better. Maybe someone finds this useful. Any suggestions for further improvement are of course very welcome!
# filter on ccm.dt
ccm.dt <- ccm.dt[linktype %in% c("LU","LC","LD","LN","LS","LX") & USEDFLAG=="1"]
setkey(ccm.dt, gvkey)
setkey(compu.dt, gvkey)
compu.merged <- merge(compu.dt, ccm.dt, all.x = TRUE, allow.cartesian = TRUE)
# deal with NAs in linkenddt - set NAs to today's date, assuming those links still exist.
today <- as.character(Sys.Date())
compu.merged[is.na(linkenddt), "linkenddt":=today]
# filter out date mismatches
compu <- compu.merged[linkdt <= endfyr & endfyr<=linkenddt]
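One extra sanity check that may be worth adding at the end (a sketch, assuming compu is still a data.table): confirm that no gvkey / fiscal-year-end combination is left matched to more than one permno after the date filter:
# gvkey / fiscal-year-end combinations that still map to multiple link rows
compu[, .N, by = .(gvkey, endfyr)][N > 1]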
