Issue with loop: argument of length 0 - r

I've tried working on this loop and come out with the below errors. I'm not sure if I can provide data, if needed I'll do my best to obfuscate the data. Here is the loop I am trying to use, any tips on what I'm doing wrong would be greatly appreciated as I haven't found a viable solution yet. The exact error is below the code.
decay_function = function(df)
{
df <- df[order(df$department,df$product,df$region,df$monthnum),]
for(mk in 1:ncol(levels_department)) {
newdata <- df[which(df$department==as.character(levels_department[,mk])), ]
levels_product<-as.data.frame(t(levels(as.factor(newdata$product))))
for(md in 1:ncol(levels_product)){
newdata <- newdata[which(newdata$product==as.character(levels_product[,md])), ]
levels_region<-as.data.frame(t(levels(as.factor(newdata$region))))
for(dm in 1:ncol(levels_region)){
newdata <- newdata[which(newdata$region==as.character(levels_region[,dm])), ]
for(i in 1:(nrow(newdata)-1)){
start_month = newdata$monthnum[i]
end_month = newdata$monthnum[nrow(newdata)]
row_vector = c()
decay_vector = c()
for(j in 5:ncol(newdata)){
k = 0
for(l in start_month:end_month){
distance_initial = (l - start_month)
vector_increment = (l - (start_month-1))
decay_rate = (0.5)^((1/halflife)*distance_initial)
decay_value = (decay_rate)*(newdata[[i,j]])
k = k + decay_value
}
df2[i,j] = k
}
print(df2)
}
if (mk=='1' & md=='1' & dm=='1'){
outdata<-df2
} else {
outdata<-rbind(outdata,df2)
}
}
}
}
}
output_data = decay_function(tempone)
Error in start_month:end_month : argument of length 0
> dput(head(df))
structure(list(monthnum = c(33, 33, 33, 33, 33, 33), Region = c(2251,
2251, 2251, 2251, 2251, 2251), Department = c("Softlines", "Softlines",
"Softlines", "Softlines", "Softlines", "Softlines"), Product = c("T-Shirt",
"Jacket", "Sweat Shirt", "Tank Top", "Sweat Pants", "Mens Jeans"
), Incentive_Amount = c(5742.43, 108006.61, 459076.67, 34006,
141632.42, 29580.38), Leads_T1 = c(0, 0, 0, 0, 0, 0), DCLeads = c(0,
1, 0, 0, 0, 0), PhoneLeads = c(0, 0, 0, 0, 0, 0), T3_CRM_Leads = c(0,
0, 0, 0, 0, 0), Leads_Third = c(0, 1, 0, 0, 0, 0)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -6L))

Related

R optim() constraint optimization does not find the first best

my problem is summarized in finding a vector X with the best solution to the problem:
L is the profits,
R is the restrictions,
P is a constraint parameters matrix,
max SUM_i (x_i * l_i)
or max(t(L)%*%X)
restriction
SUM_i(x_i*p_ij)<=r_j
or P%*%X <= R.
I find a solution for X, but not the best, which would be
fb = c(.217,0,0,23,2865,0,13,427).
How do I find the best solution?
code:
X<-matrix(rep(1,6),6,1)
P<-matrix(c(
1, 1, 1, 2, 0, 0,
0, 1, 1, 2, 1, 1,
99.4, 37.75, 19.75, 54.40, 74.75, 53,
2.400, 1.540, 0, 0, 0, 0,
2.400, 1.960, 0, 0, 0, 0,
1.800, 3.300, 5.330, 0, 0, 0,
0, 0, 2.070, 0, 8.700, 0,
0, 0, .436, 0, 19.100, 12.363,
0, 3.000, .364, 0, 9.100, 26.737 ),
9,6,1)
L <- matrix(c(83.4, 72.35, 27.3, 72.05, 217.25, 455), 6,1)
R <- matrix(c(60,60,2000,351,448,479,338,424,359),9,1)
farm<- function(par, P,R, L){
trues<- P%*%par<=R
if (min(trues)==1 && min(par)>=0) {
return(-t(L)%*%par)
}
else{
return(0)
}
}
mtds = c("Nelder-Mead", "BFGS", "CG", "L-BFGS-B", "SANN","Brent")
out <- optim(par = X, # initial guess
fn = farm,
P = P,
R = R ,
L = L,
method = mtds[5])
# my result
t(L)%*%out$par
#A matrix: 1 × 1 of type dbl
#7419.596
# the first best
fb<- matrix(c(.217,0,0,23.2865,0,13.427),6,1)
t(L)%*%fb
#A matrix: 1 × 1 of type dbl
#7805.175
I think you can try fmincon from package pracma
library(pracma)
objfun <- function(x) -t(L)%*%x
res <- fmincon(x0 = X,fn = objfun,A = P,b = R,lb = rep(0,length(X)))
and you will see that
> res$par
[1] 4.201711e-16 -1.239088e-15 1.863081e-17 2.310286e+01
[5] 5.566620e-01 1.323762e+01
> -res$value
[,1]
[1,] 7808.615
That looks very much like a model that could be solved by a linear programme.
library("Rglpk")
Rglpk_solve_LP(obj = L,
mat = P,
dir = rep("<=", 9),
rhs = R,
max = TRUE)

Create faceted xy scatters using vectors of column names in R

I have two character vectors of equal length; where position one in vector.x matches position one in vector.y and so on. The elements refer to column names in a data frame (wide format). I would like to somehow loop through these vectors to produce xy scatter graphs for each pair in the vector, preferably in a faceted plot. Here is a (hopefully) reproducible example. To be clear, with this example, I would end up with 10 scatter graphs.
vector.x <- c("Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Bathycoccus", "Brockmanniella", "Brockmanniella", "Caecitellus_paraparvulus", "Caecitellus_paraparvulus")
vector.y <- c("Aliiroseovarius", "Neptuniibacter", "Pseudofulvibacter", "Thalassobius", "unclassified_Porticoccus", "Tenacibaculum", "Pseudomonas", "unclassified_GpIIa", "Marinobacter", "Thalassobius")
structure(list(Aliiroseovarius = c(0, 0, 0, 0.00487132352941176,
0.0108639420589757), Marinobacter = c(0, 0.00219023779724656,
0, 0.00137867647058824, 0.00310398344542162), Neptuniibacter = c(0.00945829750644884,
0.00959532749269921, 0.0171310629514964, 0.2796875, 0.345835488877393
), Pseudofulvibacter = c(0, 0, 0, 0.00284926470588235, 0.00362131401965856
), Pseudomonas = c(0.00466773123694878, 0.00782227784730914,
0.0282765737874097, 0.00707720588235294, 0.00400931195033627),
Tenacibaculum = c(0, 0, 0, 0.00505514705882353, 0.00362131401965856
), Thalassobius = c(0, 0.00166875260742595, 0, 0.0633272058823529,
0.147697878944646), unclassified_GpIIa = c(0, 0.000730079265748853,
0, 0.003125, 0.00103466114847387), unclassified_Porticoccus = c(0,
0, 0, 0.00119485294117647, 0.00569063631660631), Aplanochytrium = c(0,
0, 0, 0.000700770847932726, 0.0315839846865529), Bathycoccus = c(0.000388802488335925,
0, 0, 0.0227750525578136, 0.00526399744775881), Brockmanniella = c(0,
0.00383141762452107, 0, 0.000875963559915907, 0), Caecitellus_paraparvulus = c(0,
0, 0, 0.000875963559915907, 0.00797575370872547)), row.names = c("B11",
"B13", "B22", "DI5", "FF6"), class = "data.frame")
As Rui Barradas shows, it's possible to get a very nice plot from ggplot and gridExta. If you wanted to stick to base R, here's how you'd do that (assuming your data set is called df1):
# set plot sizes
par(mfcol = c(floor(sqrt(length(vector.x))), ceiling(sqrt(length(vector.x)))))
# loop through plots
for (i in 1:length(vector.x)) {
plot(df1[[vector.x[i]]], df1[[vector.y[i]]], xlab = vector.x[i], ylab = vector.y[i])
}
# reset plot size
par(mfcol = c(1,1))
This is a bit long and convoluted but it works.
library(tidyverse)
library(gridExtra)
df_list <- apply(data.frame(vector.x, vector.y), 1, function(x){
DF <- df1[which(names(df1) %in% x)]
i <- which(names(DF) %in% vector.x)
if(i == 2) DF[2:1] else DF
})
gg_list <- lapply(df_list, function(DF){
ggplot(DF, aes(x = get(names(DF)[1]), y = get(names(DF)[2]))) +
geom_point() +
xlab(label = names(DF)[1]) +
ylab(label = names(DF)[2])
})
g <- do.call(grid.arrange, gg_list)
g
Not too elegant, but should get you going:
vector.x <- c("Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Bathycoccus", "Brockmanniella", "Brockmanniella", "Caecitellus_paraparvulus", "Caecitellus_paraparvulus")
vector.y <- c("Aliiroseovarius", "Neptuniibacter", "Pseudofulvibacter", "Thalassobius", "unclassified_Porticoccus", "Tenacibaculum", "Pseudomonas", "unclassified_GpIIa", "Marinobacter", "Thalassobius")
df1 = structure(
list(Aliiroseovarius = c(0, 0, 0, 0.00487132352941176, 0.0108639420589757),
Marinobacter = c(0, 0.00219023779724656, 0, 0.00137867647058824, 0.00310398344542162),
Neptuniibacter = c(0.00945829750644884, 0.00959532749269921, 0.0171310629514964, 0.2796875, 0.345835488877393),
Pseudofulvibacter = c(0, 0, 0, 0.00284926470588235, 0.00362131401965856),
Pseudomonas = c(0.00466773123694878, 0.00782227784730914, 0.0282765737874097, 0.00707720588235294, 0.00400931195033627),
Tenacibaculum = c(0, 0, 0, 0.00505514705882353, 0.00362131401965856),
Thalassobius = c(0, 0.00166875260742595, 0, 0.0633272058823529, 0.147697878944646),
unclassified_GpIIa = c(0, 0.000730079265748853, 0, 0.003125, 0.00103466114847387),
unclassified_Porticoccus = c(0, 0, 0, 0.00119485294117647, 0.00569063631660631),
Aplanochytrium = c(0, 0, 0, 0.000700770847932726, 0.0315839846865529),
Bathycoccus = c(0.000388802488335925, 0, 0, 0.0227750525578136, 0.00526399744775881),
Brockmanniella = c(0, 0.00383141762452107, 0, 0.000875963559915907, 0),
Caecitellus_paraparvulus = c(0, 0, 0, 0.000875963559915907, 0.00797575370872547)),
row.names = c("B11", "B13", "B22", "DI5", "FF6"),
class = "data.frame"
)
df2 = NULL
for(i in 1:10) {
df.tmp = data.frame(
plot = paste0(vector.x[i], ":", vector.y[i]),
x = df1[[vector.x[i]]],
y = df1[[vector.y[i]]]
)
if(is.null(df2)) df2=df.tmp else df2 = rbind(df2, df.tmp)
}
ggplot(data=df2, aes(x, y)) +
geom_point() +
facet_grid(cols = vars(plot))

Can I convert a certain string upon reading with fread in R data.table?

I am trying to use R's data.table package to read a large data set (~800k rows). The set contains results from a simulation of 1000 scenarios (+ a scenario "0" - so 1,001 scenarios in total) and one of the columns, "ScenId", contains the number of the scenario, e.g. 0,1,2,..
The problem is the program used to output this txt file cannot name scenario 1000 as '1000', but uses 'AAA' instead. The column 'ScenId' thus contains only numbers, apart from the value 'AAA'.
I am trying to find a solution to convert 'AAA' to 1000 preferably within the fread command.
My current workaround is using na.strings = "AAA" in fread and then replacing the NA's with 1000, after reading is complete. This works well because those are the only NA instances in the data set.
However, I was hoping for a quicker / more elegant solution, i.e. to do this within the fread command.
Any help / advice will be much appreciated.
Later edit: an attempt at posting sample data.
structure(list(ScenId = "AAA", SensId = "_", SystemProd = "ZCPP__",
AssumClass = "SPLPSV", ProjPer = 40L, ProjMode = "Annual",
VarName = "belLUL", Description = "(BEL)",
Module = "MLIAB", FormType = "inv", Group = "calc.BEL", Width = 12L,
Decimals = 2L, Scale = "Yes", Value000 = 0, Value001 = 0,
Value002 = 0, Value003 = 0, Value004 = 0, Value005 = 0, Value006 = 0,
Value007 = 0, Value008 = 0, Value009 = 0, Value010 = 0, Value011 = 0,
Value012 = 0, Value013 = 0, Value014 = 0, Value015 = 0, Value016 = 0,
Value017 = 0, Value018 = 0, Value019 = 0, Value020 = 0, Value021 = 0,
Value022 = 0, Value023 = 0, Value024 = 0, Value025 = 0, Value026 = 0,
Value027 = 0, Value028 = 0, Value029 = 0, Value030 = 0, Value031 = 0,
Value032 = 0, Value033 = 0, Value034 = 0, Value035 = 0, Value036 = 0,
Value037 = 0, Value038 = 0, Value039 = 0, Value040 = 0), .Names =("ScenId",
"SensId", "SystemProd", "AssumClass", "ProjPer", "ProjMode",
"VarName", "Description", "Module", "FormType", "Group", "Width",
"Decimals", "Scale", "Value000", "Value001", "Value002", "Value003",
"Value004", "Value005", "Value006", "Value007", "Value008", "Value009",
"Value010", "Value011", "Value012", "Value013", "Value014", "Value015",
"Value016", "Value017", "Value018", "Value019", "Value020", "Value021",
"Value022", "Value023", "Value024", "Value025", "Value026", "Value027",
"Value028", "Value029", "Value030", "Value031", "Value032", "Value033",
"Value034", "Value035", "Value036", "Value037", "Value038", "Value039",
"Value040"), class = c("data.table", "data.frame"), row.names = c(NA,
-1L), .internal.selfref = <pointer: 0x0000000000310788>)
This is just one line of my data set. Hope this makes sense.

How to change value of row if doesn't match the row above in a function applied to a dataframe

I have a dataframe as follows
structure(list(chr = 1, leftPos = 240000, OC_AH_026C.res = 0,
OC_AH_026C.1.res = 0, OC_AH_026C.2.res = 0, OC_AH_026T.res = 0,
OC_AH_058T.res = 0, OC_AH_084C.res = 0, OC_AH_084T.res = 0,
OC_AH_086C.res = 0, OC_AH_086C.1.res = 0, OC_AH_086C.2.res = 0,
OC_AH_086C.3.res = 0, OC_AH_086T.res = 0, OC_AH_088C.res = 0,
OC_AH_088T.res = 0, OC_AH_094C.res = 0, OC_AH_094C.1.res = 0,
OC_AH_094C.2.res = 0, OC_AH_094C.3.res = 0, OC_AH_094C.4.res = 0,
OC_AH_094C.5.res = 0, OC_AH_094C.6.res = 0, OC_AH_094C.7.res = 0,
OC_AH_094T.res = 0, OC_AH_096C.res = 0, OC_AH_096T.res = 0,
OC_AH_100C.res = 0, OC_AH_100C.1.res = 0, OC_AH_100T.res = 0,
OC_AH_127C.res = 0, OC_AH_127T.res = 0, OC_AH_133C.res = 0,
OC_AH_133T.res = 0, OC_ED_008C.res = 0, OC_ED_008C.1.res = 0,
OC_ED_008C.2.res = 0, OC_ED_008C.3.res = 0, OC_ED_008T.res = 0,
OC_ED_016C.res = 0, OC_ED_016T.res = 0, OC_ED_031C.res = 0,
OC_ED_031T.res = 0, OC_ED_036C.res = 0, OC_ED_036T.res = 0,
OC_GS_001C.res = 0, OC_GS_001T.res = 0, OC_QE_062C.res = 0,
OC_QE_062T.res = 0, OC_RS_010C.res = 0, OC_RS_010T.res = 0,
OC_RS_027C.res = 0, OC_RS_027C.1.res = 0, OC_RS_027C.2.res = 0,
OC_RS_027T.res = 0, OC_SH_051C.res = 0, OC_SH_051T.res = 0,
OC_ST_014C.res = 0, OC_ST_014C.1.res = 0, OC_ST_014T.res = 0,
OC_ST_016T.res = 0, OC_ST_020C.res = 0, OC_ST_020T.res = 0,
OC_ST_024C.res = 0, OC_ST_024T.res = 0, OC_ST_033C.res = 0,
OC_ST_033T.res = 0, OC_ST_034C.res = 0, OC_ST_034C.1.res = 0,
OC_ST_034C.2.res = 0, OC_ST_034T.res = 0, OC_ST_035C.res = 0,
OC_ST_035T.res = 0, OC_ST_036C.res = 0, OC_ST_036T.res = 0,
OC_ST_037T.res = 0, OC_ST_040C.res = 0, OC_ST_040T.res = 0,
OC_WG_001T.res = 0, OC_WG_002C.res = 0, OC_WG_002T.res = 0,
OC_WG_005C.res = 0, OC_WG_005T.res = 0, OC_WG_006C.res = 0,
OC_WG_006T.res = 0, OC_WG_009T.res = 0, OC_WG_019C.res = 0,
OC_WG_019T.res = 0, Means.res = 0, sd.res = 0, ind = 1L), .Names = c("chr",
"leftPos", "OC_AH_026C.res", "OC_AH_026C.1.res", "OC_AH_026C.2.res",
"OC_AH_026T.res", "OC_AH_058T.res", "OC_AH_084C.res", "OC_AH_084T.res",
"OC_AH_086C.res", "OC_AH_086C.1.res", "OC_AH_086C.2.res", "OC_AH_086C.3.res",
"OC_AH_086T.res", "OC_AH_088C.res", "OC_AH_088T.res", "OC_AH_094C.res",
"OC_AH_094C.1.res", "OC_AH_094C.2.res", "OC_AH_094C.3.res", "OC_AH_094C.4.res",
"OC_AH_094C.5.res", "OC_AH_094C.6.res", "OC_AH_094C.7.res", "OC_AH_094T.res",
"OC_AH_096C.res", "OC_AH_096T.res", "OC_AH_100C.res", "OC_AH_100C.1.res",
"OC_AH_100T.res", "OC_AH_127C.res", "OC_AH_127T.res", "OC_AH_133C.res",
"OC_AH_133T.res", "OC_ED_008C.res", "OC_ED_008C.1.res", "OC_ED_008C.2.res",
"OC_ED_008C.3.res", "OC_ED_008T.res", "OC_ED_016C.res", "OC_ED_016T.res",
"OC_ED_031C.res", "OC_ED_031T.res", "OC_ED_036C.res", "OC_ED_036T.res",
"OC_GS_001C.res", "OC_GS_001T.res", "OC_QE_062C.res", "OC_QE_062T.res",
"OC_RS_010C.res", "OC_RS_010T.res", "OC_RS_027C.res", "OC_RS_027C.1.res",
"OC_RS_027C.2.res", "OC_RS_027T.res", "OC_SH_051C.res", "OC_SH_051T.res",
"OC_ST_014C.res", "OC_ST_014C.1.res", "OC_ST_014T.res", "OC_ST_016T.res",
"OC_ST_020C.res", "OC_ST_020T.res", "OC_ST_024C.res", "OC_ST_024T.res",
"OC_ST_033C.res", "OC_ST_033T.res", "OC_ST_034C.res", "OC_ST_034C.1.res",
"OC_ST_034C.2.res", "OC_ST_034T.res", "OC_ST_035C.res", "OC_ST_035T.res",
"OC_ST_036C.res", "OC_ST_036T.res", "OC_ST_037T.res", "OC_ST_040C.res",
"OC_ST_040T.res", "OC_WG_001T.res", "OC_WG_002C.res", "OC_WG_002T.res",
"OC_WG_005C.res", "OC_WG_005T.res", "OC_WG_006C.res", "OC_WG_006T.res",
"OC_WG_009T.res", "OC_WG_019C.res", "OC_WG_019T.res", "Means.res",
"sd.res", "ind"), class = c("data.table", "data.frame"), row.names = c(NA,
-1L), .internal.selfref = <pointer: 0x103006f78>)
For each column I would like to keep the value as it is if it agrees with the row above it for that chr, only if that value is 1 or -1. If there is no agreement I'd like to convert the value to zero.
For example (not using the dput above)
chr leftPos OC_030_ST.res
1 4324 0
1 23433 1
1 34436 1
1 64755 1
3 234 1
3 354 0
4 1666 0
4 4565 0
5 34777 1
7 2345 1
7 4567 1
should become
chr leftPos OC_030_ST.res
1 4324 0
1 23433 1
1 34436 1
1 64755 1
3 234 0
3 354 0
4 1666 0
4 4565 0
5 34777 0
7 2345 1
7 4567 1
I had a dataframe (called Final) once upon a time that had a column called Def that contained all the res values in one column so I could do something like
ContZ<-setDT(Final)[,ind:=rleid(Def)][, if(.N>1) .SD, .(chr, ind)][, ind:=NULL][]
but assuming I'm going to need apply I'm not sure how to use this.
I tried:
MeOut<-lapply(df_list2res,function (col){
ContZ<-setDT(df_list2res)[,ind:=rleid(col)][, if(.N>1) .SD, .(chr, ind)][, ind:=NULL][]
})
but I get the error when I try to View(MeOut) that
Error in View : arguments imply differing number of rows:
I suspect this is because I have been getting rid of rows rather than replacing values although I can't be sure....
Here's your simple example in dplyr.
library(dplyr)
#create a simple version of your df
df<- data.frame(c(1,1,1,1,3,3,4,4,5,7,7),c(0,1,1,1,1,0,0,0,1,1,1))
names(df) <- c("chr","OC_030_ST.res")
df2 <- df%>%
mutate(last=lag(chr))%>%
mutate(OC_030_ST.res =ifelse(chr == last | is.na(last),
ifelse(OC_030_ST.res == 1|OC_030_ST.res == -1,OC_030_ST.res,0),0))%>%
select(-last)
df2
The logic here is that if the chr value one behind the current value equals the last and the 'OC_030_ST.res' value is 1 or -1, the value will be retained. In all other cases, the value is reset to 0. Please let me know if this isn't the logic you intended.
Note, the first item is a special case (as there can't be a lag on the first row), hence the is.na(last) check catches this.
edit: I realised you may want to apply this to multiple columns in your data frame. The below will let you do this
df<- data.frame(c(1,1,1,1,3,3,4,4,5,7,7),c(0,1,1,1,1,0,0,0,1,1,1),c(0,1,1,1,1,0,0,0,1,1,1))
names(df) <- c("chr","OC_030_ST.res","OC_031_ST.res")
df$count<-1:nrow(df)
make0 <- function(x) {ifelse(x == 1 | x == -1, x <-0, x <- x)}
dftemp <- df%>%
mutate(last=lag(chr))%>%
mutate(flag=ifelse(chr == last | is.na(last),0,1))%>%
dplyr::filter(flag==1)%>%
#the below column range will need to be changed to the columns you actually need to change
mutate_each(funs(make0),2:3)%>%
select(-c(flag,last))
dftemp
#update the original dataframe with the modified values
df[match(dftemp$count, df$count), ] <- dftemp
df <- subset(df,select=-count)
df

Tried streamlining w/ SDCols - got "longer object length is not a multiple of shorter object length"

I have tried searching stackoverflow and google to get answers to my question, but I couldn't find anything that applied closely enough for me to be able to apply it. However, I'm very new to R, so it's likely that I may just need a little walking through it.
If I use the following code, it works just fine.
> dput(b)
structure(list(DUMP_END_SHIFT_DATE = structure(c(1420070400,
1420070400, 1420156800, 1420156800, 1420243200, 1420243200, 1420329600,
1420329600, 1420416000, 1420416000, 1420502400), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), QUANTITY_REPORTING = c(235, 219, 232,
219, 219, 219, 219, 219, 219, 219, 235), WTRECV = c(32.71, 32.71,
20.19, 33.42, 21.61, 21.61, 21.61, 20.19, 21.61, 20.19, 24.2),
LC12 = c(0, 0, 0, 94, 100, 100, 100, 0, 100, 0, 100), LC34 = c(0,
100, 0, 6, 0, 0, 0, 0, 0, 0, 0), LC5 = c(0, 0, 5, 0, 0, 0,
0, 5, 0, 5, 0), HIS = c(25, 0, 60, 0, 0, 0, 0, 60, 0, 60,
0), UC = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), IBC = c(75,
0, 35, 0, 0, 0, 0, 35, 0, 35, 0)), .Names = c("DUMP_END_SHIFT_DATE",
"QUANTITY_REPORTING", "WTRECV", "LC12", "LC34", "LC5", "HIS",
"UC", "IBC"), class = c("data.table", "data.frame"), row.names = c(NA,
-11L), .internal.selfref = <pointer: 0x0000000005860788>)
library(data.table)
b_daily <- b[,.(d_tons=sum(QUANTITY_REPORTING)),by=DUMP_END_SHIFT_DATE]
b_daily[,"d_WTRECV" := b[,.(d_WTRECV=sum(QUANTITY_REPORTING*WTRECV)),by=DUMP_END_SHIFT_DATE] [,.(round(d_WTRECV/d_tons, digits=2))]]
b_daily[,"d_LC12" := b[,.(d_LC12=sum(QUANTITY_REPORTING*LC12)),by=DUMP_END_SHIFT_DATE] [,.(round(d_LC12/d_tons, digits=2))]]
b_daily[,"d_LC34" := b[,.(d_LC34=sum(QUANTITY_REPORTING*LC34)),by=DUMP_END_SHIFT_DATE] [,.(round(d_LC34/d_tons, digits=2))]]
b_daily[,"d_LC5" := b[,.(d_LC5=sum(QUANTITY_REPORTING*LC5)),by=DUMP_END_SHIFT_DATE] [,.(round(d_LC5/d_tons, digits=2))]]
b_daily[,"d_HIS" := b[,.(d_HIS=sum(QUANTITY_REPORTING*HIS)),by=DUMP_END_SHIFT_DATE] [,.(round(d_HIS/d_tons, digits=2))]]
b_daily[,"d_UC" := b[,.(d_UC=sum(QUANTITY_REPORTING*UC)),by=DUMP_END_SHIFT_DATE] [,.(round(d_UC/d_tons, digits=2))]]
b_daily[,"d_IBC" := b[,.(d_IBC=sum(QUANTITY_REPORTING*IBC)),by=DUMP_END_SHIFT_DATE] [,.(round(d_IBC/d_tons, digits=2))]]
However, it seems very inelegant - I think that I should be able to do this using SD and SDcols. I tried the following, just as a test case:
b_daily2 <- b[,lapply(.SD, function (x) sum(x*b[,QUANTITY_REPORTING])/sum(b[,QUANTITY_REPORTING])), by=DUMP_END_SHIFT_DATE, .SDcols=c("WTRECV")] [,.(DUMP_END_SHIFT_DATE,d_WTRECV=round(WTRECV, digits=2))]
The resulting numbers are a little off, and I get the following warning:
"In x * MQD[, QUANTITY_REPORTING] : longer object length is not a multiple of shorter object length"
I understand that this indicates recycling due to objects being different lengths...but I don't understand why or what. Any help would be much appreciated. I apologize in advance if this is an elementary question. Thank you.
This is arguably also inelegant, but at least fits into a single operation:
b_daily <- b[,{
d_tons = sum(QUANTITY_REPORTING)
d_WTRECV = round( sum(QUANTITY_REPORTING*WTRECV)/d_tons, digits = 2 )
list(d_tons = d_tons, d_WTRECV = d_WTRECV)
},by=DUMP_END_SHIFT_DATE]
If there are many columns like d_WTRECV, with names stored in cols = c("WTRECV",...), then...
cols <- c("WTRECV","LC12","LC34","LC5","HIS","UC","IBC")
b_daily2 <- b[,{
d_tons = sum(QUANTITY_REPORTING)
res = lapply(mget(cols), function(x)
round( sum(QUANTITY_REPORTING*x)/d_tons, digits = 2 )
)
c(list(d_tons = d_tons), setNames(res, paste0("d_",cols)))
},by=DUMP_END_SHIFT_DATE]
A similar approach using .SDcols will be possible when a bug related to it is fixed.
Aside. I think there is a feature request to allow for the first column to be used in computing the second, like
# NON-WORKING CODE:
b_daily <- b[,.(
d_tons = sum(QUANTITY_REPORTING),
d_WTRECV = round( sum(QUANTITY_REPORTING*WTRECV) / d_tons, digits = 2)
),by=DUMP_END_SHIFT_DATE]
This is how mutate in the dplyr package works. However, for your multicolumn case, dplyr is more of a hassle than a help, as far as I can figure.
By the way, you may want to wait on rounding. Usually, it's only a good idea for printing purposes and just unnecessarily worsens your later calculations.
I don't think there is a particularly elegant way to do this. Here's a quick take.
sdc <- c("WTRECV", "LC12", "LC34", "LC5", "HIS", "UC", "IBC")
b2 <- copy(b)
b2[, (sdc) := lapply(.SD, "*", b2[, QUANTITY_REPORTING]), .SDcols=sdc]
b_daily <- b2[, lapply(.SD, sum), by=DUMP_END_SHIFT_DATE]
data.table(
b_daily[, .(DUMP_END_SHIFT_DATE)],
b_daily[, lapply(lapply(.SD, "/", b_daily[,QUANTITY_REPORTING]), round, 2), .SDcols=sdc]
)

Resources