I am trying to run a for loop for the below scenario
I have 100k Unique customers in my transactional table which are going to n number of stores each customer.
I am trying to loop through each customer and seeing in which unique store they are going and if new store has opened within 20 kms range to that store then he/she will go to that store and I will make their Value 1 in new data frame created.
I am initializing my code in first for loop and then repeating the same for rest of my data frame.
My code is extremely slow with for loop. I don't know how to vectorise my code. Below is the snapshot of my code. Please guide me how to make this code faster and efficient.
cust_id = c(unique(kk$Customer_ID))
i = cust_id[1]
# for initializing
s = c(0,0,0,0)
df_temp = kk[kk$Customer_ID == i]
store = c(unique(df_temp$Store_Code))
system.time(
for (j in store){
if(df_temp[Store_Code == j]$dist.km298 < 20) {
s[1] <- 1
}
if(df_temp[Store_Code == j]$dist.km299 < 20) {
s[2] <- 1
}
if(df_temp[Store_Code == j]$dist.km300 < 20) {
s[3] <- 1
}
if(df_temp[Store_Code == j]$dist.km301 < 20) {
s[4] <- 1
}
}
)
vishal <- data.table("Customer_ID" = c(i,i,i,i) , "Store_Code" =
c(60298,60299,60300,60301), "Prediction" = s)
cust_id <- cust_id[!cust_id %in% c(cust_id[1])]
# loop for all customers
count = 1
system.time(for (k in 1:length(cust_id)){
i <- cust_id[k]
# count <- count+1
# if (count == 5) {
# break
#}
s = c(0,0,0,0)
df_temp = kk[kk$Customer_ID == i]
store = c(unique(df_temp$Store_Code))
for (j in store){
#if(df_temp$Store_Code == j & df_temp$Purchase_2016 != 0 &
df_temp$Purchase_2017 == 0){
if(df_temp[Store_Code == j]$dist.km298 < 20) {
s[1] <- 1
}
if(df_temp[Store_Code == j]$dist.km299 < 20) {
s[2] <- 1
}
if(df_temp[Store_Code == j]$dist.km300 < 20) {
s[3] <- 1
}
if(df_temp[Store_Code == j]$dist.km301 < 20) {
s[4] <- 1
}
}
v_temp <- data.table("Customer_ID" = c(i,i,i,i) , "Store_Code" =
c(60298,60299,60300,60301), "Prediction" = s)
l = list(vishal,v_temp)
vishal <- rbindlist(l)
}
)
dput(head(kk, 5))
structure(list(Customer_ID =
structure(c(1800000006365760, 1800000006365820,1800000006366060
,1800000006366060,1800000006366060), class = "integer64"), Store_Code =
c(60067, 60054, 60066,
60069, 60079), Purchase_2016 = c(2L, 1L, 1L, 1L, 2L), Purchase_2017 =
c(2L,
0L, 0L, 0L, 0L), TotalPurchases = c(4L, 1L, 1L, 1L, 2L), Return_2016 =
c(0L,
0L, 0L, 0L, 0L), Return_2017 = c(0L, 0L, 0L, 0L, 0L), Return_2010 = c(0L,
0L, 0L, 0L, 0L), Rp_Ratio_2016 = c(0, 0, 0, 0, 0), Rp_Ratio_2017 = c(0,
0, 0, 0, 0), Sales_Per_Day = c(1699.6, 2101.1, 1331.4, 1813.1,
1193.1), Store_Launch_Date = structure(c(1323820800, 1322006400,
1338163200, 1311984000, 1385164800), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), Store_Size_Sq_Ft = c(8673.5, 12425.5, 15897.1,
6698.1, 3699.5), Customer_Count = c(89351, 118444, 79249, 114246,
54832), Total_Revenue = c(35350868.4, 43702303, 27693164.7, 37712369.7,
24816886.2), dist.km298 = c(140.24, 123.87, 10.2, 131.96, 128.52
), dist.km299 = c(163.37, 140.2, 79.32, 153.01, 145.03), dist.km300 =
c(4.09,
21.05, 126.55, 7.03, 17.41), dist.km301 = c(5.72, 19.04, 125.46,
5.02, 15.4), Nationality = c("INDIA", "UNITED ARAB EMIRATES",
"SRI LANKA", "SRI LANKA", "SRI LANKA"), Gender = c("M", "F",
"M", "M", "M"), Marital_Status = c("Married", "Married", "Married",
"Married", "Married"), Loyalty_Status = c("Gold", "Silver", "Silver",
"Silver", "Silver"), Points = c(814L, 212L, 186L, 186L, 186L),
Age = c(59L, 119L, 59L, 59L, 59L), LastVisit = c(2, 28, 3,
3, 3), Last_rdm_txn_dt1 = structure(c(17601, 16510, 17196,
17196, 17196), class = "Date"), Last_accr_txn_dt1 = structure(c(17801,
17029, 17774, 17774, 17774), class = "Date")), .Names = c("Customer_ID",
"Store_Code", "Purchase_2016", "Purchase_2017", "TotalPurchases",
"Return_2016", "Return_2017", "Return_2010", "Rp_Ratio_2016",
"Rp_Ratio_2017", "Sales_Per_Day", "Store_Launch_Date",
"Store_Size_Sq_Ft",
"Customer_Count", "Total_Revenue", "dist.km298", "dist.km299",
"dist.km300", "dist.km301", "Nationality", "Gender", "Marital_Status",
"Loyalty_Status", "Points", "Age", "LastVisit", "Last_rdm_txn_dt1",
"Last_accr_txn_dt1"), sorted = "Customer_ID", class = c("data.table",
"data.frame"), row.names = c(NA, -5L), .internal.selfref = <pointer:
0x0000000004810788>)
Please read the guide to posting good questions on StackOverflow, it will allow people to answer your questions easily. You question is very confusing but this might give you some ideas:
data %>%
group_by(Customer_ID, Store_Code) %>%
mutate(Predition298 = ifelse(dist.km298 > 20, 1, 0),
Predition299 = ifelse(dist.km299 > 20, 1, 0),
Predition300 = ifelse(dist.km300 > 20, 1, 0))
Related
Please see the attached dput. I would need to transform the dataframe in question to a form that consists of five columns: Area, Group, Seats, Votes (%) and ShapleyShubik. The number of rows per certain area should be dependent on the number of Groups within that Area. I believe this desired end result is somewhat like of what is referenced as 'long format' of data.
structure(list(Area = c("Germany", "France", "Italy", "Spain"
), data = list(structure(list(Group = c("Group1", "Group2 ",
"Group3 ", "Group4 ", "Group5 ", "Group6 ", "Group7 ", "Group8 ",
"Group9 "), Seats = c(2L, 13L, 23L, 9L, 11L, 5L, 18L, 3L, 1L
), NeededQuota = c(43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-9L)), structure(list(Group = c("Group2 ", "Group4 ", "Group6 ",
"Group1", "Group7 ", "Group3 "), Seats = c(5L, 5L, 1L, 6L, 1L,
9L), NeededQuota = c(14L, 14L, 14L, 14L, 14L, 14L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -6L)), structure(list(
Group = c("Group4 ", "Group7 ", "Group5 ", "Group2 ", "Group8 ",
"Group9 ", "Group1", "Group6 ", "Group3 "), Seats = c(8L,
14L, 2L, 10L, 2L, 3L, 2L, 6L, 28L), NeededQuota = c(38L,
38L, 38L, 38L, 38L, 38L, 38L, 38L, 38L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -9L)), structure(list(
Group = c("Group6 ", "Group2 ", "Group7 ", "Group3 ", "Group4 ",
"Group9 ", "Group5 ", "Group10"), Seats = c(10L, 9L, 1L,
3L, 4L, 1L, 2L, 1L), NeededQuota = c(16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -8L))), models = list(structure(list(Results = structure(c(2,
0.0235294117647059, 0.0261904761904762, 13, 0.152941176470588,
0.142857142857143, 23, 0.270588235294118, 0.3, 9, 0.105882352941176,
0.107142857142857, 11, 0.129411764705882, 0.121428571428571,
5, 0.0588235294117647, 0.0476190476190476, 18, 0.211764705882353,
0.214285714285714, 3, 0.0352941176470588, 0.0333333333333333,
1, 0.0117647058823529, 0.00714285714285714), .Dim = c(3L, 9L), .Dimnames = list(
c("Votes", "Votes (%)", "Shapley-Shubik"), c("Group1", "Group2 ",
"Group3 ", "Group4 ", "Group5 ", "Group6 ", "Group7 ",
"Group8 ", "Group9 "))), Distribution = c(2L, 13L, 23L,
9L, 11L, 5L, 18L, 3L, 1L), function (object, contr, how.many,
...)
{
if (isFALSE(as.logical(Sys.getenv("_R_OPTIONS_STRINGS_AS_FACTORS_"))))
object <- as.factor(object)
if (!nlevels(object))
stop("object not interpretable as a factor")
if (!missing(contr) && is.name(Xcontr <- substitute(contr)))
contr <- switch(as.character(Xcontr), poly = "contr.poly",
helmert = "contr.helmert", sum = "contr.sum", treatment = "contr.treatment",
SAS = "contr.SAS", contr)
if (missing(contr)) {
oc <- getOption("contrasts")
contr <- if (length(oc) < 2L)
if (is.ordered(object))
contr.poly
else contr.treatment
else oc[1 + is.ordered(object)]
}
if (missing(how.many) && missing(...))
contrasts(object) <- contr
else {
if (is.character(contr))
contr <- get(contr, mode = "function")
if (is.function(contr))
contr <- contr(nlevels(object), ...)
contrasts(object, how.many) <- contr
}
object
}, Method = "PowerIndex", Quota = 43L, Names = c("Group1", "Group2 ",
"Group3 ", "Group4 ", "Group5 ", "Group6 ", "Group7 ", "Group8 ",
"Group9 ")), class = "ShapleyShubik"), structure(list(Results = structure(c(5,
0.185185185185185, 0.166666666666667, 5, 0.185185185185185, 0.166666666666667,
1, 0.037037037037037, 0, 6, 0.222222222222222, 0.166666666666667,
1, 0.037037037037037, 0, 9, 0.333333333333333, 0.5), .Dim = c(3L,
6L), .Dimnames = list(c("Votes", "Votes (%)", "Shapley-Shubik"
), c("Group2 ", "Group4 ", "Group6 ", "Group1", "Group7 ",
"Group3 "))), Distribution = c(5L, 5L, 1L, 6L, 1L, 9L), function (object,
contr, how.many, ...)
{
if (isFALSE(as.logical(Sys.getenv("_R_OPTIONS_STRINGS_AS_FACTORS_"))))
object <- as.factor(object)
if (!nlevels(object))
stop("object not interpretable as a factor")
if (!missing(contr) && is.name(Xcontr <- substitute(contr)))
contr <- switch(as.character(Xcontr), poly = "contr.poly",
helmert = "contr.helmert", sum = "contr.sum", treatment = "contr.treatment",
SAS = "contr.SAS", contr)
if (missing(contr)) {
oc <- getOption("contrasts")
contr <- if (length(oc) < 2L)
if (is.ordered(object))
contr.poly
else contr.treatment
else oc[1 + is.ordered(object)]
}
if (missing(how.many) && missing(...))
contrasts(object) <- contr
else {
if (is.character(contr))
contr <- get(contr, mode = "function")
if (is.function(contr))
contr <- contr(nlevels(object), ...)
contrasts(object, how.many) <- contr
}
object
}, Method = "PowerIndex", Quota = 14L, Names = c("Group2 ", "Group4 ",
"Group6 ", "Group1", "Group7 ", "Group3 ")), class = "ShapleyShubik"),
structure(list(Results = structure(c(8, 0.106666666666667,
0.096031746031746, 14, 0.186666666666667, 0.131746031746032,
2, 0.0266666666666667, 0.0198412698412698, 10, 0.133333333333333,
0.131746031746032, 2, 0.0266666666666667, 0.0198412698412698,
3, 0.04, 0.0198412698412698, 2, 0.0266666666666667, 0.0198412698412698,
6, 0.08, 0.0484126984126984, 28, 0.373333333333333, 0.512698412698413
), .Dim = c(3L, 9L), .Dimnames = list(c("Votes", "Votes (%)",
"Shapley-Shubik"), c("Group4 ", "Group7 ", "Group5 ", "Group2 ",
"Group8 ", "Group9 ", "Group1", "Group6 ", "Group3 "))),
Distribution = c(8L, 14L, 2L, 10L, 2L, 3L, 2L, 6L, 28L
), function (object, contr, how.many, ...)
{
if (isFALSE(as.logical(Sys.getenv("_R_OPTIONS_STRINGS_AS_FACTORS_"))))
object <- as.factor(object)
if (!nlevels(object))
stop("object not interpretable as a factor")
if (!missing(contr) && is.name(Xcontr <- substitute(contr)))
contr <- switch(as.character(Xcontr), poly = "contr.poly",
helmert = "contr.helmert", sum = "contr.sum",
treatment = "contr.treatment", SAS = "contr.SAS",
contr)
if (missing(contr)) {
oc <- getOption("contrasts")
contr <- if (length(oc) < 2L)
if (is.ordered(object))
contr.poly
else contr.treatment
else oc[1 + is.ordered(object)]
}
if (missing(how.many) && missing(...))
contrasts(object) <- contr
else {
if (is.character(contr))
contr <- get(contr, mode = "function")
if (is.function(contr))
contr <- contr(nlevels(object), ...)
contrasts(object, how.many) <- contr
}
object
}, Method = "PowerIndex", Quota = 38L, Names = c("Group4 ",
"Group7 ", "Group5 ", "Group2 ", "Group8 ", "Group9 ",
"Group1", "Group6 ", "Group3 ")), class = "ShapleyShubik"),
structure(list(Results = structure(c(10, 0.32258064516129,
0.323809523809524, 9, 0.290322580645161, 0.261904761904762,
1, 0.032258064516129, 0.0285714285714286, 3, 0.0967741935483871,
0.0952380952380952, 4, 0.129032258064516, 0.157142857142857,
1, 0.032258064516129, 0.0285714285714286, 2, 0.0645161290322581,
0.0761904761904762, 1, 0.032258064516129, 0.0285714285714286
), .Dim = c(3L, 8L), .Dimnames = list(c("Votes", "Votes (%)",
"Shapley-Shubik"), c("Group6 ", "Group2 ", "Group7 ", "Group3 ",
"Group4 ", "Group9 ", "Group5 ", "Group10"))), Distribution = c(10L,
9L, 1L, 3L, 4L, 1L, 2L, 1L), function (object, contr, how.many,
...)
{
if (isFALSE(as.logical(Sys.getenv("_R_OPTIONS_STRINGS_AS_FACTORS_"))))
object <- as.factor(object)
if (!nlevels(object))
stop("object not interpretable as a factor")
if (!missing(contr) && is.name(Xcontr <- substitute(contr)))
contr <- switch(as.character(Xcontr), poly = "contr.poly",
helmert = "contr.helmert", sum = "contr.sum",
treatment = "contr.treatment", SAS = "contr.SAS",
contr)
if (missing(contr)) {
oc <- getOption("contrasts")
contr <- if (length(oc) < 2L)
if (is.ordered(object))
contr.poly
else contr.treatment
else oc[1 + is.ordered(object)]
}
if (missing(how.many) && missing(...))
contrasts(object) <- contr
else {
if (is.character(contr))
contr <- get(contr, mode = "function")
if (is.function(contr))
contr <- contr(nlevels(object), ...)
contrasts(object, how.many) <- contr
}
object
}, Method = "PowerIndex", Quota = 16L, Names = c("Group6 ",
"Group2 ", "Group7 ", "Group3 ", "Group4 ", "Group9 ",
"Group5 ", "Group10")), class = "ShapleyShubik"))), row.names = c(NA,
-4L), groups = structure(list(Area = c("France", "Germany", "Italy",
"Spain"), .rows = structure(list(2L, 1L, 3L, 4L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
I initially thought I would have to unnest the structure, and tried
ByArea <- outputdata %>%
group_by(Area) %>%
unnest()
but it produces an error telling that Input must be a vector, not a <ShapleyShubik> object.
EDIT:
The head of the needed output is as such:
Area Group Seats Seats(%) ShapleyShubik
Germany Group1 2 0.25 0.1234
Germany Group2 2 0.25 0.1234
Germany Group3 4 0.50 0.7532
It looks like you're fairly new to SO; welcome to the community! To get the best answers quickly, it's always best to make your question reproducible. You've got the data here, but not the libraries.
Either way, I think I can help. This is using several of the packages called with tidyverse.
library(tidyverse)
showMe <- map_dfr(1:4, # there are four power models in this object
# first capture the country for each group
# create a vector of repeats the length of the model
~cbind(Area = rep(outputdata$Area[[.x]],
times = ncol(outputdata$models[[.x]][["Results"]])),
# now capture the results
t(outputdata$models[[.x]][["Results"]]) %>%
as.data.frame() %>%
# move the groups from row names to a column
mutate(Groups = rownames(.),
.before = 1)))
This is what you would see at this point:
# Area Groups Votes Votes (%) Shapley-Shubik
# Group1...1 Germany Group1 2 0.02352941 0.026190476
# Group2 ...2 Germany Group2 13 0.15294118 0.142857143
# Group3 ...3 Germany Group3 23 0.27058824 0.300000000
# Group4 ...4 Germany Group4 9 0.10588235 0.107142857
# Group5 ...5 Germany Group5 11 0.12941176 0.121428571
# Group6 ...6 Germany Group6 5 0.05882353 0.047619048
# Group7 ...7 Germany Group7 18 0.21176471 0.214285714
# Group8 ...8 Germany Group8 3 0.03529412 0.033333333
Next, remove the row names and trim the whitespace in the Groups field.
# now remove rownames, then remove whitespace from groups
rownames(showMe) <- NULL
showMe$Groups <- trimws(showMe$Groups)
showMe
# Area Groups Votes Votes (%) Shapley-Shubik
# 1 Germany Group1 2 0.02352941 0.026190476
# 2 Germany Group2 13 0.15294118 0.142857143
# 3 Germany Group3 23 0.27058824 0.300000000
# 4 Germany Group4 9 0.10588235 0.107142857
# 5 Germany Group5 11 0.12941176 0.121428571
# 6 Germany Group6 5 0.05882353 0.047619048
# 7 Germany Group7 18 0.21176471 0.214285714
I've created this function below that produces data that will go in a report in a UI.
However its not necessarily doing what I would like it to in the name and age arguments. It prints out the name and age in connection to how many orders there are. So if i.e. Customer ID 59 made 2 orders - her name will be printed out "Jane" "Jane" - I would like it to not do that.
If anyone has any idea on how to change this, i'd appreciate your input.
CustomerReport <- function(ID, Start_Date, End_Date) {
CustomerOrders <- OrdersData[OrdersData$Customer_ID == ID & OrdersData$Date >= Start_Date & OrdersData$Date <= End_Date,]
ProductOrders <- ItemsInOrders[ItemsInOrders$Order_ID %in% CustomerOrders$Order_ID,]
CustomerInfo <- CustomersData[CustomersData$Customer_ID == ID,]
Name <- paste(CustomerInfo$First_Name, CustomerInfo$Last_Name)
Age <- CustomerInfo$Customer_Age
NumberofOrders <- nrow(CustomerOrders)
MeanTotals <- mean(ProductOrders$Quantities)
MedianTotals <- median(ProductOrders$Quantities)
PercentageType <- table(CustomerOrders$Type)/NumberofOrders
PercentageBreakdown <- table(ProductOrders$Products)/nrow(ItemsInOrders)
Result <- list(Name = Name, Age = Age, NumberofOrders = NumberofOrders, MeanTotals = MeanTotals,
MedianTotals = MedianTotals, PercentageType = PercentageType, PercentageBreakdown = PercentageBreakdown
)
return(Result)
}
#Test the Customer Report Funcion
CustomerReport(1251, "2019-01-01", "2019-01-25")
the dput for the data frames
dput(droplevels(CustomersData[1:5, ]))
structure(list(First_Name = c("Ariel", "Kinshasa", "May", "Gabrielle",
"Jennifer"), Last_Name = c("Dirrim", "Purifoy", "Sue", "Finley",
"Towns"), Customer_ID = c(1251L, 290L, 1714L, 381L, 109L), Customer_DOB = structure(c(11181,
3956, 10632, 9742, 11145), class = "Date"), Customer_Age = c(20,
39, 21, 24, 20)), row.names = c(NA, 5L), class = "data.frame")
dput(droplevels(OrdersData[1:5, ]))
structure(list(Order_ID = c(69L, 3025L, 3549L, 27L, 4561L), Customer_ID = c(1251L,
290L, 1714L, 381L, 109L), Date = structure(c(17899, 17921, 17925,
17923, 17917), class = "Date"), Type = structure(c(2L, 1L, 2L,
2L, 2L), .Label = c("Delivery", "Pick Up"), class = "factor"),
Coupon = c("OFF10", NA, "LARGE10", "LARGE10", "LARGE10"),
Delivery_Fee = c("0", "12", "0", "0", "0"), Sub_Total_Before_Discount = c(27.98,
40.9, 74.94, 91.85, 80.82), Discount = c(2.8, 0, 7.49, 9.19,
8.08), Sub_Total_After_Discount = c(25.18, 40.9, 67.45, 82.66,
72.74), GST = c(2.52, 4.09, 6.74, 8.27, 7.27), Total = c(27.7,
44.99, 74.19, 90.93, 80.01)), row.names = c(NA, 5L), class = "data.frame")
dput(droplevels(ItemsInOrders[1:5, ]))
structure(list(Order_ID = c(69L, 3025L, 3025L, 3549L, 3549L),
Products = structure(c(2L, 4L, 1L, 3L, 5L), .Label = c("BBQ Chicken Pizza",
"Meatlovers Pizza", "Seafood Pizza", "Supreme Pizza", "Vegetarian Pizza"
), class = "factor"), Prices = c(13.99, 13.95, 14.95, 13.99,
10.99), Quantities = c(2L, 1L, 1L, 3L, 3L)), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))
>
Everything else is perfect. Just the Names and the age are coming out in duplicates or triplicates.
Also, while we are here - is it possible to return the Percentage breakdowns as actual % values rather than 0.1 etc?
I guess you are looking for unique. However, the behaviour you are describing isn't reproducible with the data you provided.
Try to replace the two lines getting the name and age information in your CustomerReport function:
Name <- paste(unique(CustomerInfo$First_Name), unique(CustomerInfo$Last_Name))
Age <- unique(CustomerInfo$Customer_Age)
Calculation inside for loop & ifelse is working when I have 100-200 rows but not working when I have 20000 rows.
Can someone help me with the FOR loop and IFELSE if something is wrong or if there is some timeout happening in R studio when running for & if-else loop
Code:
#FROM HERE IT IS NOT WORKING WHEN WE HAVE 20000 ROWS OF DATA IN FINAL DATFRAME.
#WE ARE CREATING FINAL_V1 WHICH IS POPULATING ONLY 1 ROW
#New Dataframe with Null values
Final <- structure(list(Item = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "0S1576", class = "factor"),
LC = structure(1:6, .Label = c("MW92", "OY01", "RM11", "RS11",
"WK14", "WK15"), class = "factor"), Fiscal.Week = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "2019-W24", class = "factor"),
SS = c(15L, 7L, 5L, 9L, 2L, 2L), Freq = c(3, 6, 1, 2, 1,
1), agg = c(1, 1, 1, 1, 0, 0)), row.names = c(NA, -6L), class = "data.frame")
lctolc <- structure(list(Item = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "0S1576", class = "factor"),
LC = structure(c(1L, 2L, 2L, 3L, 3L), .Label = c("MW92",
"OY01", "RM11"), class = "factor"), ToLC = structure(1:5, .Label = c("OY01",
"RM11", "RS11", "WK14", "WK15"), class = "factor")), row.names = c(NA,
-5L), class = "data.frame")
df <- as.data.frame(unique(Final$Item))
Final_v1 <- NA
j <- 1
i <- 1
#SS computations
#For 1 to no of rows in df(which is having no of unique items
for(j in 1:nrow(df)) {
#copying the data from Final to Final_v1(with charater type)
Final_v1 <- Final[Final$Item == as.character(df[j,1]),]
#for 1 to the no of rows in Final_v1
for(i in 1:nrow(Final_v1)) {
if(Final_v1[i,6] <= 0)
{
Final_v1[i,7] = Final_v1[i,4]}
else
{
if(Final_v1[i,5] == '1')
{
Final_v1[i,7]=0
}
else
{
Final_v1[i,7]=Final_v1[i,4]
}
SSNew <- Final_v1[i,7]
#Leftover distribution
LCS <- lctolc$ToLC[Final_v1$Item[i] == lctolc$Item & Final_v1$LC[i] == lctolc$LC]
inds <- Final_v1$LC %in% LCS
if (any(inds))
{ Final_v1$SS[inds]<- if (SSNew == 0) {Final_v1$SS[inds]==0} else {Final_v1$SS[inds]=Final_v1$SS[inds]} }
}
}
names(Final_v1)[7] <- "SSNew"
}
Can someone help why it is not performing for 20000rows
I am working on a tv retail dataset in R and wanted to put steps I will need to use repeatedly into a function.
This includes checking the VIF and return it, run the STEP algorithm to determine the best model and then use the result of the STEP and display it.
The major issue is the error message
Error in eval(predvars, data, env) : object 'Hour' not found
which appears to appear in the step() call.
Regression <- function(data, dep_var, features) {
lin.null = lm(paste(dep_var,'~ 1', sep = ''), data= data)
lin.full = lm(paste(dep_var,'~', paste(features, collapse='+'), sep = ''), data = data)
vif(lin.full)
opt = step(lin.null, scope = list(lower = lin.null, upper = lin.full), direction = "forward")
step_opt = opt$call
stargazer(step_opt, type = 'text')
}
dep_var = 'imp'
feat = c('Hour', 'grp')
paste(dep_var,'~', paste(feat, collapse='+'), sep = '')
Regression(comb_a, 'imp', feat)
The final result should show me the VIF values for each variable and the stargazer output of the STEP optimized regression.
EDIT 1:
comb_a is the input data the regression should take
The dput() output follows down below:
# comb_a
structure(list(Day = structure(c(1483833600, 1483833600, 1483833600,
1483833600, 1483833600, 1483833600), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), Hour = c(0, 1, 6, 7, 8, 9), Model = c("Model A",
"Model A", "Model A", "Model A", "Model A", "Model A"), tv_count = c(5L,
8L, 4L, 9L, 11L, 8L), grp_abs = c(55500, 8308, 19026, 12184,
10141, 113225), grp = c(0.22, 0.03, 0.07, 0.05, 0.04, 0.45),
sum_duration = c(150, 240, 120, 270, 330, 240), grp_per_second = c(370,
34.6166666666667, 158.55, 45.1259259259259, 30.730303030303,
471.770833333333), hours_since = c(NA, 1, 5, 1, 1, 1), camp_count = c(2L,
2L, 2L, 2L, 3L, 4L), imp = c(528, 319, 97, 182, 327, 785),
clicks = c(28, 15, 6, 13, 29, 53), leads = c(0, 0, 0, 0,
0, 1)), .Names = c("Day", "Hour", "Model", "tv_count", "grp_abs",
"grp", "sum_duration", "grp_per_second", "hours_since", "camp_count",
"imp", "clicks", "leads"), row.names = c(NA, -6L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), vars = c("Day", "Hour"), drop = TRUE, indices = list(
0L, 1L, 2L, 3L, 4L, 5L), group_sizes = c(1L, 1L, 1L, 1L,
1L, 1L), biggest_group_size = 1L, labels = structure(list(Day = structure(c(1483833600,
1483833600, 1483833600, 1483833600, 1483833600, 1483833600), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), Hour = c(0, 1, 6, 7, 8, 9)), row.names = c(NA,
-6L), class = "data.frame", vars = c("Day", "Hour"), drop = TRUE, .Names = c("Day",
"Hour")))
desired output would be: (Numbers are just for representation)
> vif(lin.full)
Hour grp sum_duration grp_per_second hours_since camp_count
2.979362 4.981504 2.290328 3.279818 1.013725 1.110823
imp clicks
7.471457 9.244811
> stargazer(step_opt, type = 'text')
===============================================
Dependent variable:
---------------------------
leads
-----------------------------------------------
clicks 0.005***
(0.0004)
camp_count 0.040*
(0.024)
Constant -0.107
(0.098)
-----------------------------------------------
Observations 898
R2 0.181
Adjusted R2 0.179
Residual Std. Error 0.772 (df = 895)
F Statistic 98.901*** (df = 2; 895)
===============================================
Note: *p<0.1; **p<0.05; ***p<0.01
I am trying to convert some code with commands from the apply() family into a for loop but when executing the script I get an error when converting to as.numeric(): (list) object cannot be coerced to type 'double'.
The reason why I want to use a for loop is because of the implementation of a progress bar by using txtProgressBar which to my understanding cannot be used in an apply/lapply command. I've found the pbapply package but as I am calling multiple scripts with the source() command which all run with txtProgressBar as progress indicator, I'd like to use it for consistency reasons.
Here is the code I tried to convert into a for loop.
xy <- structure(list(NAME = structure(c(2L, 2L, 1L, 1L), .Label = c("CISCO", "JOHN"), class = "factor"), ID = c(41L, 41L, 57L, 57L), X_START_YEAR = c(1965L, 1932L, 1998L, 1956L), Y_START_VALUE = c(960L, -45L, 22L, -570L), X_END_YEAR = c(1968L, 1955L, 2002L, 1970L), Y_END_VALUE = c(960L, -45L, 22L, -570L), LC = structure(c(1L, 1L, 2L, 2L), .Label = c("CA", "US"), class = "factor")), .Names = c("NAME", "ID", "X_START_YEAR","Y_START_VALUE", "X_END_YEAR", "Y_END_VALUE", "LC"), class = "data.frame", row.names = c(NA,-4L))
ind <- split(xy,xy$ID)
# Progress bar
pb = txtProgressBar(min = 0, max = length(ind), initial = 0,title=Test, style=3)
progress <- 1
# Plots
for (i in ind){
xx = unlist(i[, c(3, 5)])
yy = unlist(i[, c(4, 6)])
fname <- paste0(i[1, 'ID'],'.png')
png(fname, width=1679, height=1165, res=150)
par(mar=c(6,8,6,5))
plot(xx,yy,type='n',main=unique(i[,1]), xlab="Time [Years]", ylab="Value [mm]",ylim = range(c(yy,-.5,.5)))
i <- i[,-1]
rect(i[3], min(i[4], 0), i[5], max(i[4], 0), col=if(as.numeric(i[4]) < 0) 'red' else 'blue')
abline(h=0, col = "gray60")
progress = progress + 1
setTxtProgressBar(pb,progress)
dev.off()
}
Here is the original code which works by using lapply and apply functions, however txtProgressBar couldn't be implemented.
xy <- structure(list(NAME = structure(c(2L, 2L, 1L, 1L), .Label = c("CISCO", "JOHN"), class = "factor"), ID = c(41L, 41L, 57L, 57L), X_START_YEAR = c(1965L, 1932L, 1998L, 1956L), Y_START_VALUE = c(960L, -45L, 22L, -570L), X_END_YEAR = c(1968L, 1955L, 2002L, 1970L), Y_END_VALUE = c(960L, -45L, 22L, -570L), LC = structure(c(1L, 1L, 2L, 2L), .Label = c("CA", "US"), class = "factor")), .Names = c("NAME", "ID", "X_START_YEAR","Y_START_VALUE", "X_END_YEAR", "Y_END_VALUE", "LC"), class = "data.frame", row.names = c(NA,-4L))
ind <- split(xy,xy$ID)
lapply(ind, function(x) {
plot(unlist(x[, c(3, 5)]), unlist(x[, c(4, 6)]), type='n',
xlab='Time [Years]', ylab='Value [mm]', main=x[1, 1])
apply(x, 1, function(y) {
rect(y[3], min(y[4], 0), y[5], max(y[4], 0),
col=if(as.numeric(y[4]) < 0) 'red' else 'blue')
abline(h=0)
})
})
My question: Does anyone see how to suppress the conversion error mentioned above and complete the for loop in order to include the progress bar with txtProgressBar()?
This at least runs and makes plots.
as.numeric in the code below is expecting a single element or a vector, but i[4] is a list, so you need to i[, 4] it
xy <- structure(list(NAME = structure(c(2L, 2L, 1L, 1L), .Label = c("CISCO", "JOHN"), class = "factor"), ID = c(41L, 41L, 57L, 57L), X_START_YEAR = c(1965L, 1932L, 1998L, 1956L), Y_START_VALUE = c(960L, -45L, 22L, -570L), X_END_YEAR = c(1968L, 1955L, 2002L, 1970L), Y_END_VALUE = c(960L, -45L, 22L, -570L), LC = structure(c(1L, 1L, 2L, 2L), .Label = c("CA", "US"), class = "factor")), .Names = c("NAME", "ID", "X_START_YEAR","Y_START_VALUE", "X_END_YEAR", "Y_END_VALUE", "LC"), class = "data.frame", row.names = c(NA,-4L))
ind <- split(xy,xy$ID)
# Progress bar
pb = txtProgressBar(min = 0, max = length(ind), initial = 0,title=Test, style=3)
progress <- 1
# Plots
## changed these next two lines
for (i in seq_along(ind)){
i <- ind[[i]]
xx = unlist(i[, c(3, 5)])
yy = unlist(i[, c(4, 6)])
fname <- paste0(i[1, 'ID'],'.png')
# png(fname, width=1679, height=1165, res=150)
par(mar=c(6,8,6,5))
plot(xx,yy,type='n',main=unique(i[,1]), xlab="Time [Years]", ylab="Value [mm]",ylim = range(c(yy,-.5,.5)))
# i <- i[,-1]
apply(i, 1, function(y)
rect(y[3], min(y[4], 0), y[5], max(y[4], 0),
col=if(as.numeric(y[4]) < 0) 'red' else 'blue'))
abline(h=0, col = "gray60")
progress = progress + 1
setTxtProgressBar(pb,progress)
# dev.off()
}
# |==========================================================================| 100%