Related
I have a script in r to calculate body condition residuals. I would like to apply this code to each columns, which correspond to a specific category of individual.
For example i would run this code to calculate body condition residuals of all individuals that are in the category 1
1) Select rows of interest
Data1 = RawData %>% select(ID,temperature, Bodysize1, mass1, year) %>% filter((temperature %in% c(20:29) & Bodysize1 %in% c(20:100) & mass1 %in% c(15:40))
2) Create a new model with created data
Model1 =lmer(log(mass1) ~ log(Bodysize1) + temperature + (1|year), data = Data1)
3) Extract residuals and add ID to the residuals
ResModel1 = resid(Model1)
ID=Data1$ID
Res1 =data.frame(ResModel1 ,ID)
4) Add residuals to my RawData
RawData2.0 = merge(RawData, Res1, by = c("ID"), all.x = T)
In order to avoid reruning this code and manually changing all the 1 by 2 and then all the 2 by 3... etc is there a way to do this commande automatic whith loops and the apply familly?
My data
ID TEMPERATURE BODYSIZE1 MASS1 BODYSIZE2 MASS2 YEAR
81-012 0.03830645 200 1450 205 1425 1981
84-069 0.26923078 200 1473 205 1498 1984
84-134 0.32692307 209 1448 195 1323 1984
84-145 0.27884614 197 1373 197 1498 1984
84-190 0.31129807 191 1248 195 1323 1984
85-155 0.33056709 198 1637 229 1988 1985
Thanks in advance
Withou example data it is tough to say if this will work but maybe creating a function can simplify your workflow
library(tidyverse)
get_resid <- function(df,filters) {
df_to_model <- df %>% filter({{filters}})
df_to_keep <- df <- filter({{filters}},.preserve = FALSE)
Data1 <- df_to_model %>%
select(ID,temperature, Bodysize1, mass1, year)
Model1 <- lmer(log(mass1) ~ log(Bodysize1) + temperature + (1|year), data = Data1)
ResModel1 <- resid(Model1)
ID <- Data1$ID
Res1 <- data.frame(ResModel1 ,ID)
Res1 %>%
bind_rows(df_to_keep)
}
Then you may use this this function in your pipes
RawData %>%
get_resid(temperature %in% c(20:29) & Bodysize1 %in% c(20:100) & mass1 %in% c(15:40))
You might try to i) create a tibble, in the first column list all dep variables as strings,
ii) list your models of indep vars in the second column,
iii) create a formula in the third column
iv) run your model in the fourth column
df <- tibble(dep = paste0("log(var",seq(1,10,1),")"),
x = "~ your_x_vars") %>%
mutate(formula = as.formula(paste0(dep,x))) %>%
mutate(reg = map(formula, ~lm(as.formula(.x), data=df) ))
then you can easily extract the residuals
I work for an insurance company and I am trying to improve something that I built. I have about 150 data frames that look like this:
library(data.table)
dt_Premium<-data.table(Policy = c("Pol123","Pol333","Pol555","Pol999"),
Base_Premium_Fire= c(45,55,105,92),
Base_Premium_Water= c(20,21,24,29),
Base_Premium_Theft= c(3,5,6,7))
dt_Discount_Factors<-data.table(Policy = c("Pol123","Pol333","Pol555","Pol999"),
Discount_Factor_Fire= c(.9,.95,.99,.97),
Discount_Factor_Water= c(.8,.85,.9,.96),
Discount_Factor_Theft= c(1,1,1,1))
dt_Territory_Factors<-data.table(Policy = c("Pol123","Pol333","Pol555","Pol999"),
Territory_Factor_Fire= c(1.9,1.2,.91,1.03),
Territory_Factor_Water= c(1.03,1.3,1.25,1.01),
Territory_Factor_Theft= c(1,1.5,1,.5))
dt_Fixed_Expense<-data.table(Policy = c("Pol123","Pol333","Pol555","Pol999"),
Fixed_Expense_Fire= c(5,5,5,5),
Fixed_Expense_Water= c(7,7,7,7),
Fixed_Expense_Theft= c(9,9,9,9))
I take the base premium and then I multiply by factors, and then add a fixed expense at the very end. My code is currently something like:
dt_Final_Premium<-cbind(dt_Premium[,1],dt_Premium[,2:4]*
dt_Discount_Factors[,2:4]*
dt_Territory_Factors[,2:4]+
dt_Fixed_Expense[,2:4])
What I hate about this:
-The 2:4 stuff (I would like to be able to use a named range)
-The typing is monstrous considering all of the tables and policies I actually have
-It is very confusing for anybody except me (the author) to understand and edit/adjust the code
-I would like to be able to have each rating step as part of a list, and then just iterate over that list (or a similar process).
-Ideally I would be able to get the values at each step. For example :
step2_answer<-cbind(dt_Premium[,1],dt_Premium[,2:4]*
dt_Discount_Factors[,2:4])
There just has to be a way were I can take a dataframe/datatable and then just multiply or add to the next dataframe/datatable in the series. Thanks for taking a look at this?
How about something like this using dplyr?!
Here I am using the same calculation that you have mentioned but row wise using mutate function of dplyr which makes it clear to see the step by step and for anyone to understand the calculation easily.
library(data.table)
library(dplyr)
dt_Premium <- data.table(Policy = c("Pol123","Pol333","Pol555","Pol999"),
Base_Premium_Fire= c(45,55,105,92),
Base_Premium_Water= c(20,21,24,29),
Base_Premium_Theft= c(3,5,6,7))
dt_Discount_Factors <- data.table(Policy = c("Pol123","Pol333","Pol555","Pol999"),
Discount_Factor_Fire= c(.9,.95,.99,.97),
Discount_Factor_Water= c(.8,.85,.9,.96),
Discount_Factor_Theft= c(1,1,1,1))
dt_Territory_Factors <- data.table(Policy = c("Pol123","Pol333","Pol555","Pol999"),
Territory_Factor_Fire= c(1.9,1.2,.91,1.03),
Territory_Factor_Water= c(1.03,1.3,1.25,1.01),
Territory_Factor_Theft= c(1,1.5,1,.5))
dt_Fixed_Expense <- data.table(Policy = c("Pol123","Pol333","Pol555","Pol999"),
Fixed_Expense_Fire= c(5,5,5,5),
Fixed_Expense_Water= c(7,7,7,7),
Fixed_Expense_Theft= c(9,9,9,9))
dt_Final_Premium <- cbind(dt_Premium[,1],dt_Premium[,2:4]*
dt_Discount_Factors[,2:4]*
dt_Territory_Factors[,2:4]+
dt_Fixed_Expense[,2:4])
new_dt_final_premium <-
dt_Premium %>%
# Joining all tables together
left_join(dt_Discount_Factors, by = "Policy") %>%
left_join(dt_Territory_Factors, by = "Policy") %>%
left_join(dt_Fixed_Expense, by = "Policy") %>%
# Calculating required calculation
mutate(
Base_Premium_Fire =
Base_Premium_Fire * Discount_Factor_Fire * Territory_Factor_Fire + Fixed_Expense_Fire,
Base_Premium_Water =
Base_Premium_Water * Discount_Factor_Water * Territory_Factor_Water + Fixed_Expense_Water,
Base_Premium_Theft =
Base_Premium_Theft * Discount_Factor_Theft * Territory_Factor_Theft + Fixed_Expense_Theft) %>%
select(Policy, Base_Premium_Fire, Base_Premium_Water, Base_Premium_Theft)
Since your columns have a clean naming, some pivoting may do the work:
library(tidyverse) #to be run after library(data.table)
dt_Premium %>%
left_join(dt_Discount_Factors, by="Policy") %>%
left_join(dt_Territory_Factors, by="Policy") %>%
left_join(dt_Fixed_Expense, by="Policy") %>%
pivot_longer(cols=-Policy)%>%
separate(name, into=c("name", "object"), sep="_.*_") %>%
pivot_wider() %>%
mutate(total=Base*Discount*Territory+Fixed) %>% #or calculate the value for a specific step
select(Policy, object, total) %>%
pivot_wider(names_from = "object", values_from = "total")
After joining all the columns, you can pivot to a long format and turn columns to rows. There, you can separate the name into the real name (Base, Discount, Fixed...) and the object (Fire, Water, ...) and return to the wide format. The tricky part is to get a good regular expression, as your names use the underscore twice. Mine can be vastly improved but will do the work for now.
After this, you can calculate whatever you want, select only the result and pivot to wide one last time. If you want to get all the results, you may tweak this last pivot with prefixes.
Pivoting is quite a gymnastics, but it has proven to be very effective once you get used to it.
As you have a lot of tables, if you can get them as a list, you can also use purrr::reduce to join them all at once and simplify the first lines of code:
list(dt_Premium, dt_Discount_Factors, dt_Territory_Factors, dt_Fixed_Expense) %>%
reduce(left_join, by='Policy') %>%
pivot_longer(cols=-Policy)%>%
separate(name, into=c("name", "object"), sep="_.*_") %>%
pivot_wider() %>%
mutate(total=Base*Discount*Territory+Fixed) %>% #of calculate the value for a specific step
select(Policy, object, total) %>%
pivot_wider(names_from = "object", values_from = "total")
Another option is to reorganize the data by converting into a long format, merge and then perform the calculations:
DT <- Reduce(merge, lapply(dtList, function(d) {
vn <- sub('_([^_]*)$', '', names(d)[2L]) #see reference [1]
melt(d, id.vars="Policy", value.name=vn)[,
variable := gsub("(.*)_(.*)_(.*)", "\\3", variable)]
}))
DT
DT[, disc_prem := Base_Premium * Discount_Factor][,
disc_prem_loc := disc_prem * Territory_Factor][,
Final_Premium := disc_prem_loc + Fixed_Expense]
output:
Policy variable Base_Premium Discount_Factor Territory_Factor Fixed_Expense disc_prem disc_prem_loc Final_Premium
1: Pol123 Fire 45 0.90 1.90 5 40.50 76.9500 81.9500
2: Pol123 Theft 3 1.00 1.00 9 3.00 3.0000 12.0000
3: Pol123 Water 20 0.80 1.03 7 16.00 16.4800 23.4800
4: Pol333 Fire 55 0.95 1.20 5 52.25 62.7000 67.7000
5: Pol333 Theft 5 1.00 1.50 9 5.00 7.5000 16.5000
6: Pol333 Water 21 0.85 1.30 7 17.85 23.2050 30.2050
7: Pol555 Fire 105 0.99 0.91 5 103.95 94.5945 99.5945
8: Pol555 Theft 6 1.00 1.00 9 6.00 6.0000 15.0000
9: Pol555 Water 24 0.90 1.25 7 21.60 27.0000 34.0000
10: Pol999 Fire 92 0.97 1.03 5 89.24 91.9172 96.9172
11: Pol999 Theft 7 1.00 0.50 9 7.00 3.5000 12.5000
12: Pol999 Water 29 0.96 1.01 7 27.84 28.1184 35.1184
data:
dtLs <- list(dt_Premium, dt_Discount_Factors, dt_Territory_Factors, dt_Fixed_Expense)
Reference:
regex-return-all-before-the-second-occurrence
I am guessing reading some of rdata.table vignettes would help you tighten up syntax and make it more terse. Some of us think terse = 'more readable' in numeric programming. Others think that represents some level of insanity:
vignette(package="data.table")
Understanding Map, Reduce, mget and other functional notation in R and rdata.table may help. Here are some things I have done from a data.table mindset:
Dropping cols syntax might be more terse using 'i' to drop a vector of cols:
dt[is.na(dt)] <- 0 # replace NA with 0
drop_col_list <- c('dropcol1','dropcol2','dropcol3') # drop col list
# dt <- dt[!drop_col_list,sapply(dt,as.numeric)] # make selected dt cols numeric type
dt[!drop_col_list,SumCol := Reduce(`+`, dt)] # adds Sum col with 'functional programming' iteration
The lapply(.SD, func) format is very powerful:
fsum <- function(x) {sum(x,na.rm=TRUE)}
dt[,lapply(.SD,fsum),by=,.SDcols=c("col1","col2","col3","col4")]
# or
dt[!drop_col_list,lapply(.SD,fsum)]
This shows applying the internal data.table 'set' function (':=') and mget to create cols derived from operations with functional programming on two data.tables. The data.table(s) may need to have the same nrow():
nm1 <- names(dt1)[1:4]
nm2 <- names(dt2)[1:4]
dt[, SumCol := Reduce(`+`, Map(`*`, mget(nm1), mget(nm2)))]
The loop below isn't really rdata.table'esq' programming but outputs a data.table. Probably this isn't as fast as more data.table like syntax:
seqXpi <- function(x) {x * pi}
seqXexp <- function(x) {x * exp(1)}
l <- {};
for(x in seq(1,10,1)) l <- as.data.table(rbind(l,cbind(seq=x,seqXpi=seqXpi(x),seqXexp=seqXexp(x))))
I've configured the data how I needed but it took 15 lines of code. I was sure it could be done in 1 or 2, and I'm hoping someone a lot better at this can teach me how. Here it is...
I have a table with 11 variables that consists of a Date, 4 pairs of spread and price observations, followed by the year and quarter corresponding to the data column. The 4 pairs of data each correspond to different TBA mortgage coupons (3%, 3.5%, 4%, 4.5%).
mbstrimlast table
I need the 8 columns to be in 2 columns named ZSpread and Price, and then each pair tagged with the coupon Type.
Here's the code I used. Thanks!
mbs3 <- mbstrimlast[,c("Date",ZSpread="FN3sprd",Price="FN3px")]
names(mbs3) <- c("Date","Zspread","Price")
mbs3.5 <- mbstrimlast[,c("Date",ZSpread="FN3.5sprd",Price="FN3.5px")]
names(mbs3.5) <- c("Date","Zspread","Price")
mbs4 <- mbstrimlast[,c("Date",ZSpread="FN4sprd",Price="FN4px")]
names(mbs4) <- c("Date","Zspread","Price")
mbs4.5 <- mbstrimlast[,c("Date",ZSpread="FN4.5sprd",Price="FN4.5px")]
names(mbs4.5) <- c("Date","Zspread","Price")
mbs3$Type = c("FN3")
mbs3.5$Type = c("FN3.5")
mbs4$Type = c("FN4")
mbs4.5$Type = c("FN4.5")
mbslast = bind_rows(mbs3, mbs3.5, mbs4, mbs4.5)
mbslast <- mbslast %>% mutate(Yeartag = year(mbslast$Date))
mbslast <- mbslast %>% mutate(Qtag = quarters(mbslast$Date, abbreviate = T))
We can use the tidyverse package to make the code to complete this task a bit cleaner. First, we use gather to reshape from wide to long, then we create type and key columns using grepl and gsub, finally, we use spread to get the data back into a tidier format.
library(tidyverse)
mbstrimlast %>%
gather(variable, value, -Date, -Yeartag, -Qrts) %>% # wide to long
# column creation
mutate(type = ifelse(grepl(pattern = 'sprd', x = variable), 'Spread', 'Price'),
key = gsub(pattern = 'sprd|px', replacement = '', x = variable)) %>%
select(-variable) %>% # remove variable column
spread(type, value) # tidier
Date Yeartag Qrts key Price Spread
1 2018-06-17 23:00:00 2018 Q2 FN3 96.35938 52.8
2 2018-06-17 23:00:00 2018 Q2 FN3.5 99.10938 67.7
3 2018-06-17 23:00:00 2018 Q2 FN4 101.64844 81.9
4 2018-06-17 23:00:00 2018 Q2 FN4.5 103.89062 87.2
Here my dataframe:
df = read.csv(text = '"Date","Value","ID","WY"
1975-02-01,-1.16543693088,"Tweed",1975
1975-03-01,-1.05372283483,"Tweed",1975
1975-04-01,-1.06632370439,"Tweed",1975
1975-05-01,-1.18903485356,"Tweed",1975
1992-05-01,-1.04737467143,"Ouse",1992
1992-06-01,-1.4058281451,"Ouse",1992
1992-07-01,-1.13608647243,"Ouse",1992
1992-08-01,-0.802566581309,"Ouse",1992
1992-09-01,-0.551433852821,"Ouse",1992
1992-10-01,-0.625997598552,"Ouse",1993
1992-11-01,-0.483559758609,"Ouse",1993
1992-12-01,-0.792013395632,"Ouse",1993
1993-01-01,-0.754618121962,"Ouse",1993
1993-02-01,-1.2504282139,"Ouse",1993
1996-01-01,-0.945410385985,"Trent",1996
1996-02-01,-0.84249575782,"Trent",1996
1996-03-01,-1.10332425045,"Trent",1996
1996-04-01,-1.22634133042,"Trent",1996
1996-05-01,-1.2335181635,"Trent",1996
1996-06-01,-1.23451130358,"Trent",1996
1996-07-01,-1.25902677738,"Trent",1996
1996-08-01,-1.13068733413,"Trent",1996', header = TRUE)
I need to find the annual maximum value for each ID and WY group.
The following code do the trick very easily but its output only shows the year of each annual maximum whereas I am interested also in the relative month and day:
df_AMAX = aggregate(df$Value, by = list(df$WY, df$ID), max)
colnames(df_AMAX) = c('Date', 'ID', 'Value')
print(df_AMAX)
Date ID Value
1 1992 Ouse -0.5514339
2 1993 Ouse -0.4835598
3 1996 Trent -0.8424958
4 1975 Tweed -1.0537228
My output should be:
Date ID Value
1 1992-09-01 Ouse -0.5514339
2 1993-11-01 Ouse -0.4835598
3 1996-02-01 Trent -0.8424958
4 1975-03-01 Tweed -1.0537228
It should be a silly thing but please let me know if you have any suggestion.
Thanks
Use subset with ave. Note that the function passed to ave returns a logical but ave will coerce it to the class of Value so we use !! to make it logical again. No packages are used.
mx_all <- function(x) if (length(x)) x == max(x)
subset(df, !!ave(Value, ID, WY, FUN = mx_all))
or
mx_first <- function(x) if (length(x)) seq_along(x) == which.max(x)
subset(df, !!ave(Value, ID, WY, FUN = mx_first))
These give the same answer for the sample input and will always give the same answer if there is a unique maximum in each group but if there are multiple maxima in a group then the first one gives all of them and the second gives the first.
There is of course a dplyr solution, too:
df %>%
group_by(WY, ID) %>%
summarise(
Value = max(Value),
Date = Date[which.max(Value)]) %>%
ungroup() %>%
select(ID:Date)
A dataset in R looks like below:
LD.D LD.L LD.P
Y.1992.a1 67.89552605 33.21192862 90.7750688
Y.1992.a2 227.1370541 79.67211036 154.5165077
Y.1992.a3 94.5326718 24.72816922 151.665545
Y.1992.a4 106.8793485 56.07635245 100.6711004
Y.1992.a5 97.41402289 46.93434073 100.8787496
Y.1993.a1 150.045093 19.64290196 27.81953228
Y.1993.a2 106.5888189 21.38886866 84.82532249
Y.1993.a3 110.7493543 25.41765759 70.02222315
Y.1993.a4 237.1246502 16.43006029 75.17407065
Y.1993.a5 234.5403261 16.93082727 49.01639754
Y.1994.a1 94.5326718 24.72816922 151.665545
Y.1994.a2 106.8793485 56.07635245 100.6711004
Y.1994.a3 97.41402289 46.93434073 100.8787496
Y.1994.a4 150.045093 19.64290196 27.81953228
Y.1994.a5 106.5888189 21.38886866 84.82532249
For each year I have got five replicates. The question is how could I have the aveage of each single year (e.g., 1992 and 1993 and 1994)?
You could do this using either base R or with specialized packages such as dplyr or data.table (more efficient when the dataset is really big).
df$Year <- gsub("^.\\.(\\d+)\\..*", "\\1", row.names(df)) #extracted the year alone from the row names and created a column `Year` in the dataset
library(dplyr)
df %>%
group_by(Year) %>% #grouped by Year variable
summarise_each(funs(mean=mean(., na.rm=TRUE))) #when you specify the function, `summarise_each will applies the function (here it is mean) to each of the columns in the dataset or a subset of columns (if specified)
# Source: local data frame [3 x 4]
# Year LD.D LD.L LD.P
#1 1992 118.7717 48.12458 119.70139
#2 1993 167.8096 19.96206 61.37151
#3 1994 111.0920 33.75413 93.17205
Using data.table. Convert to data.table using setDT and use lapply on a Subset of Data.table (.SD) columns and get the mean. Use by to specify the grouping variable Year.
library(data.table)
setDT(df)[, lapply(.SD, mean, na.rm=TRUE), by=Year]
# Year LD.D LD.L LD.P
#1: 1992 118.7717 48.12458 119.70139
#2: 1993 167.8096 19.96206 61.37151
#3: 1994 111.0920 33.75413 93.17205
Or using base R. There are different ways aggregate, by, split etc. Here is one with by. Use regex (lookbehind) to get the Year. In this case, I am getting the Y prefix also as it doesn't affect the results.
Year <- gsub("(?<=[0-9])\\..*$", "", row.names(df), perl=TRUE)
do.call(`rbind`,by(df, Year, FUN= colMeans, na.rm=TRUE))
# LD.D LD.L LD.P
#Y.1992 118.7717 48.12458 119.70139
#Y.1993 167.8096 19.96206 61.37151
#Y.1994 111.0920 33.75413 93.17205
data
df <- structure(list(LD.D = c(67.89552605, 227.1370541, 94.5326718,
106.8793485, 97.41402289, 150.045093, 106.5888189, 110.7493543,
237.1246502, 234.5403261, 94.5326718, 106.8793485, 97.41402289,
150.045093, 106.5888189), LD.L = c(33.21192862, 79.67211036,
24.72816922, 56.07635245, 46.93434073, 19.64290196, 21.38886866,
25.41765759, 16.43006029, 16.93082727, 24.72816922, 56.07635245,
46.93434073, 19.64290196, 21.38886866), LD.P = c(90.7750688,
154.5165077, 151.665545, 100.6711004, 100.8787496, 27.81953228,
84.82532249, 70.02222315, 75.17407065, 49.01639754, 151.665545,
100.6711004, 100.8787496, 27.81953228, 84.82532249)), .Names = c("LD.D",
"LD.L", "LD.P"), class = "data.frame", row.names = c("Y.1992.a1",
"Y.1992.a2", "Y.1992.a3", "Y.1992.a4", "Y.1992.a5", "Y.1993.a1",
"Y.1993.a2", "Y.1993.a3", "Y.1993.a4", "Y.1993.a5", "Y.1994.a1",
"Y.1994.a2", "Y.1994.a3", "Y.1994.a4", "Y.1994.a5"))
Try aggregate where DF is the data frame:
aggregate(DF, list(Year = gsub("^Y.|.[^.]*$", "", rownames(DF))), mean)