Calculate and output the date of customer's first order - r

Data:
DB <- data.frame(orderID = c(1,2,3,4,4,5,6,6,7,8),
orderDate = c("1.1.12","1.1.12","1.1.12","13.1.12","13.1.12","12.1.12","10.1.12","10.1.12","21.1.12","24.1.12"),
itemID = c(2,3,2,5,12,4,2,3,1,5),
customerID = c(1, 2, 3, 1, 1, 3, 2, 2, 1, 1),
itemPrice = c(9.99, 14.99, 9.99, 19.99, 29.99, 4.99, 9.99, 14.99, 49.99, 19.99))
Expected outcome:
DB <- data.frame(orderID = c(1,2,3,4,4,5,6,6,7,8),
orderDate = c("1.1.12","2.1.12","3.1.12","13.1.12","13.1.12","12.1.12","10.1.12","10.1.12","21.1.12","24.1.12"),
itemID = c(2,3,2,5,12,4,2,3,1,5),
customerID = c(1, 2, 3, 1, 1, 3, 2, 2, 1, 1),
itemPrice = c(9.99, 14.99, 9.99, 19.99, 29.99, 4.99, 9.99, 14.99, 49.99, 19.99),
DateOfFirstOrderofCustomer = c("1.1.12", "2.1.12", "3.1.12", "1.1.12", "1.1.12", "3.1.12", "2.1.12", "2.1.12", "1.1.12", "1.1.12"))
For Understanding:
The orderID is continuous. Products orderd from the same customerID at the same day get the same orderID. When the same customer orders products at another day he/she it´s a new orderID.
I want to add an additional column for every row/entry which contains the date of the customer's first order (e.g. customer 1 (customerID 1) made his first order on 1.1.12 so this date is entered in all orders from this customer). How can we do this?
The original data has about 500k rows: so plz give a solution which needs only little perfomance.

With just base R functions:
# convert the date column to date-format
DB$orderDate <- as.Date(DB$orderDate, format('%d.%m.%y'))
# get the first date for each customer
DB$DateFirstOrder <- with(DB, ave(orderDate, customerID, FUN = min))
the result is then (using the data of Mike Spencer):
> DB
orderID orderDate itemID customerID itemPrice DateFirstOrder
1 1 2012-01-01 2 1 9.99 2012-01-01
2 2 2012-01-04 3 2 14.99 2012-01-04
3 3 2012-01-06 2 3 9.99 2012-01-06
4 4 2012-01-13 5 1 19.99 2012-01-01
5 4 2012-01-13 12 1 29.99 2012-01-01
6 5 2012-01-12 4 3 4.99 2012-01-06
7 6 2012-01-10 2 2 9.99 2012-01-04
8 6 2012-01-10 3 2 14.99 2012-01-04
9 7 2012-01-21 1 1 49.99 2012-01-01
10 8 2012-01-24 5 1 19.99 2012-01-01
For the fastest solution, I would recommend the data.table package. To get the desired result with this package, you need to do:
library(data.table)
setDT(DB)[, orderDate := as.Date(orderDate, format('%d.%m.%y'))
][, DateFirstOrder := min(orderDate), by = customerID]

I've changed your input data as those you provided all had the same date of first purchase from each customer, so it was impossible to tell if the code worked. The example uses dplyr, you could also use tapply, but you'd need to reformat the named vector.
# Dummy data
DB <- data.frame(orderID = c(1,2,3,4,4,5,6,6,7,8),
orderDate = c("1.1.12","4.1.12","6.1.12","13.1.12","13.1.12","12.1.12","10.1.12","10.1.12","21.1.12","24.1.12"),
itemID = c(2,3,2,5,12,4,2,3,1,5),
customerID = c(1, 2, 3, 1, 1, 3, 2, 2, 1, 1),
itemPrice = c(9.99, 14.99, 9.99, 19.99, 29.99, 4.99, 9.99, 14.99, 49.99, 19.99))
# -------------------------------------------
# Change dates to a readable format
DB$orderDate <- as.Date(DB$orderDate, format="%d.%m.%y")
# -------------------------------------------
library(dplyr)
DB <- DB %>%
group_by(customerID) %>%
mutate(DateOfFirstOrderofCustomer=min(orderDate))

I am using plyr package. rest everything is same.
DB <- data.frame(orderID = c(1,2,3,4,4,5,6,6,7,8),
orderDate = c("1.1.12","4.1.12","6.1.12","13.1.12","13.1.12","12.1.12","10.1.12","10.1.12","21.1.12","24.1.12"),
itemID = c(2,3,2,5,12,4,2,3,1,5),
customerID = c(1, 2, 3, 1, 1, 3, 2, 2, 1, 1),
itemPrice = c(9.99, 14.99, 9.99, 19.99, 29.99, 4.99, 9.99, 14.99, 49.99, 19.99))
install.packages("plyr")
library(plyr)
DB$orderDate <- as.Date(DB$orderDate, format="%d.%m.%y")
DB = ddply(DB, .(customerID), mutate, DateOfFirstOrderofCustomer = min(orderDate))

Related

Extract columns from data frames in a list in a separate list of data frames

I have a list -cj1- with multiple data frames
dput(head(cj1[1:2]))
list(structure(list(individual = c("a12TTT.pdf", "a15.pdf", "a17.pdf",
"a18.pdf", "a21.pdf", "a2TTT.pdf", "a5.pdf", "B11.pdf", "B12.pdf",
"B13.pdf", "B22.pdf", "B24.pdf", "B4.pdf", "B7.pdf", "B8.pdf",
"cw10-1.pdf", "cw13-1.pdf", "cw15-1TTT.pdf", "cw17-1.pdf", "cw18.pdf",
"cw3.pdf", "cw4.pdf", "cw7_1TTT.pdf"), id = 1:23, Ntot = c(13,
9, 16, 15, 9, 13, 10, 10, 11, 10, 14, 10, 11, 12, 11, 10, 15,
12, 14, 11, 9, 10, 11), N1 = c(5, 5, 10, 11, 7, 9, 5, 5, 6, 8,
8, 8, 9, 8, 7, 1, 0, 6, 3, 4, 2, 4, 2), ND = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), N0 = c(8,
4, 6, 4, 2, 4, 5, 5, 5, 2, 6, 2, 2, 4, 4, 9, 15, 6, 11, 7, 7,
6, 9), score = c(5.06923076923077, 4.96666666666667, 9.925, 10.86,
6.83333333333333, 8.88461538461539, 5, 5, 5.97272727272727, 7.82,
7.95714285714286, 7.82, 8.80909090909091, 7.9, 6.91818181818182,
1.24, 0.3, 6, 3.17142857142857, 4.08181818181818, 2.16666666666667,
4.06, 2.19090909090909), propscore = c(0.389940828402367, 0.551851851851852,
0.6203125, 0.724, 0.759259259259259, 0.683431952662722, 0.5,
0.5, 0.54297520661157, 0.782, 0.568367346938776, 0.782, 0.800826446280992,
0.658333333333333, 0.628925619834711, 0.124, 0.02, 0.5, 0.226530612244898,
0.371074380165289, 0.240740740740741, 0.406, 0.199173553719008
), theta = c(-0.571211122446447, 0.418736780198501, 0.464533662219296,
0.760432013134893, 1.43961032059382, 0.935963883364303, 0.0742361005467161,
0.416783201347136, 0.232586422933618, 1.65345248955369, 0.178947462869717,
1.3980442736112, 1.5300599487058, 0.340087410746963, 0.616985944469495,
-1.73246102772711, -4.06186172096556, -0.347700710331151, -1.21009964741398,
0.239145600406579, -1.88836418690337, -0.276451472526056, -0.611455626388059
), se.theta = c(0.689550115014498, 0.689441554709003, 0.595659709892116,
0.609506508256404, 0.917792293663691, 0.652011367164736, 0.720534163064516,
0.695969555549033, 0.661019531367007, 0.87050969318314, 0.605775647419845,
0.797443937820774, 0.768436114096332, 0.695748274310803, 0.709380679025605,
1.00089414765463, 1.8701468050665, 0.68959824350285, 0.733014089189809,
0.656392513303483, 0.952935324276941, 0.71608982789968, 0.771906532861938
), outfit = c(1.24922700170817, 1.46067763769417, 0.915183304626819,
0.753992664091072, 0.37410361433915, 0.727316037037668, 0.616907868814702,
1.01528298230254, 1.01594232662062, 0.616808170683195, 0.646097057961938,
0.622993494551005, 0.807441271101246, 0.788526018181888, 1.2157399735092,
0.341189086206191, 0.021052091633073, 0.543024513106335, 1.04183076617928,
1.1772656963046, 0.736106160865241, 0.756316095787985, 0.58320701094964
), infit = c(1.4078580948461, 1.42854494963967, 1.09762978932861,
0.893957122448352, 0.64936943769433, 0.899191443180872, 0.724956556509282,
1.14975990693782, 1.08074439712469, 0.978248081241133, 0.755557633771936,
0.823903684368671, 0.911855771375284, 0.954272320131035, 0.926253596526142,
0.634052701587448, 0.0504659822408584, 0.712539957033173, 0.966034039620798,
1.1901663169553, 0.81371119642719, 0.817417869881874, 0.737574872116582
)), row.names = c(NA, -23L), class = "data.frame"), structure(list(
parlabel = c("Ties", "Home"), par = c("delta", "eta"), est = c(-43.5016417611571,
0.337872999554289), se = c(366043197.615422, 0.215169736220537
)), row.names = c(NA, -2L), class = "data.frame"))
Here is how data frames look:
head(cj1[[1]],2)
individual id Ntot N1 ND N0 score propscore theta se.theta outfit
1 a12TTT.pdf 1 13 5 0 8 5.069231 0.3899408 -0.5712111 0.6895501 1.249227
2 a15.pdf 2 9 5 0 4 4.966667 0.5518519 0.4187368 0.6894416 1.460678
infit
1 1.407858
2 1.428545
I would like to create a separate list -results1- that would contain data frames that would include columns 1 and 9 named individual and theta
I tried:
results1<-sapply(cj1, "[",c("individual","theta") )
Error in [.data.frame(X[[i]], ...) : undefined columns selected
library(dplyr)
> results1 <- lapply(cj1, function(x) x%>% select(individual,theta))
Error:
Can't subset columns that don't exist.
x Column individual doesn't exist.
Run rlang::last_error() to see where the error occurred.
I can subtract these columns from one data frame:
cj[[1]][c(1,9)]
I could not apply this to the whole list.
You can use the following solution. We use .x to refer to every individual element of your list. Here .x can be each of your data frames of which we would like to select only 2 columns c("individual","theta").
However, since only one of your data frames contains such column names I used keep function to actually keep only elements whose data frames contain the desired column name. Just bear in mind for this form of coding which is called purrr-style formula we need ~ before .x. So you use map function which is an equivalent to lapply from base R and use this syntax to apply whatever function on every individual elements (data frames here).
library(purrr)
cj1 %>%
map_if(~ all(c("individual","theta") %in% names(.x)),
~ .x %>% select(individual, theta)) %>%
keep(~ all(c("individual","theta") %in% names(.x)))
[[1]]
individual theta
1 a12TTT.pdf -0.5712111
2 a15.pdf 0.4187368
3 a17.pdf 0.4645337
4 a18.pdf 0.7604320
5 a21.pdf 1.4396103
6 a2TTT.pdf 0.9359639
7 a5.pdf 0.0742361
8 B11.pdf 0.4167832
9 B12.pdf 0.2325864
10 B13.pdf 1.6534525
11 B22.pdf 0.1789475
12 B24.pdf 1.3980443
13 B4.pdf 1.5300599
14 B7.pdf 0.3400874
15 B8.pdf 0.6169859
16 cw10-1.pdf -1.7324610
17 cw13-1.pdf -4.0618617
18 cw15-1TTT.pdf -0.3477007
19 cw17-1.pdf -1.2100996
20 cw18.pdf 0.2391456
21 cw3.pdf -1.8883642
22 cw4.pdf -0.2764515
23 cw7_1TTT.pdf -0.6114556
Or we can spare a line of code to be more concise:
cj1 %>%
keep(~ all(c("individual","theta") %in% names(.x))) %>%
map(~ .x %>% select(individual, theta))
[[1]]
individual theta
1 a12TTT.pdf -0.5712111
2 a15.pdf 0.4187368
3 a17.pdf 0.4645337
4 a18.pdf 0.7604320
5 a21.pdf 1.4396103
6 a2TTT.pdf 0.9359639
7 a5.pdf 0.0742361
8 B11.pdf 0.4167832
9 B12.pdf 0.2325864
10 B13.pdf 1.6534525
11 B22.pdf 0.1789475
12 B24.pdf 1.3980443
13 B4.pdf 1.5300599
14 B7.pdf 0.3400874
15 B8.pdf 0.6169859
16 cw10-1.pdf -1.7324610
17 cw13-1.pdf -4.0618617
18 cw15-1TTT.pdf -0.3477007
19 cw17-1.pdf -1.2100996
20 cw18.pdf 0.2391456
21 cw3.pdf -1.8883642
22 cw4.pdf -0.2764515
23 cw7_1TTT.pdf -0.6114556
Here is just another base R solution with a slightly different syntax. Just note that \(x) is equivalent to function(x) which is a new feature available as of R. 4.1.0.
cj1 |>
lapply(\(x) {
if(all(c("individual","theta") %in% names(x))) {
`[`(x, c("individual","theta"))
}
}
) -> cj2
cj2 <- cj2[-which(sapply(cj2, is.null))] |> as.data.frame()
In base R, you can try this solution with lapply -
cols <- c("individual","theta")
lapply(cj1, function(x) if(all(cols %in% names(x))) x[cols])
#[[1]]
# individual theta
#1 a12TTT.pdf -0.5712
#2 a15.pdf 0.4187
#3 a17.pdf 0.4645
#4 a18.pdf 0.7604
#5 a21.pdf 1.4396
#6 a2TTT.pdf 0.9360
#7 a5.pdf 0.0742
#8 B11.pdf 0.4168
#9 B12.pdf 0.2326
#10 B13.pdf 1.6535
#11 B22.pdf 0.1789
#12 B24.pdf 1.3980
#13 B4.pdf 1.5301
#14 B7.pdf 0.3401
#15 B8.pdf 0.6170
#16 cw10-1.pdf -1.7325
#17 cw13-1.pdf -4.0619
#18 cw15-1TTT.pdf -0.3477
#19 cw17-1.pdf -1.2101
#20 cw18.pdf 0.2391
#21 cw3.pdf -1.8884
#22 cw4.pdf -0.2765
#23 cw7_1TTT.pdf -0.6115
#[[2]]
#NULL
If you want to drop the NULL lists you can add Filter -
Filter(length, lapply(cj1, function(x) if(all(cols %in% names(x))) x[cols]))

How to group by with if statement in R?

I have data containing four variables (id, quantity, weight, date) and i want to make packages of quantity=6 using just observations with quantity below 6, example : if i have 6 products of quantity 1 each,
i want in return just 1 product of quantity 6 where i sum(weight[which(qte)<6])
And do it for all elements of the table
I've tried this code but it's not working, can anyone help me please ?
poids = c()
qte =c()
dd = data.frame()
for (i in length(paquet)){
if(paquet$RealQuantity[i]+paquet$RealQuantity[i+1]==6){
poids[i] = sum(paquet$RealWeigth)
qte[i] = sum(paquet$RealQuantity)
dd = rbind(dd,data.frame(i=i,poids = poids[i],qte =qte[i]))
}
}
this is an example :
A tibble: 232 x 4
ProductID RealQuantity RealWeigth PickingDate
<dbl> <dbl> <dbl> <date>
1 1 5 0.296 2017-12-26
2 1 1 0.064 2018-01-05
3 1 1 0.061 2018-01-05
4 1 5 0.297 2018-01-10
5 1 5 0.298 2018-01-13
6 1 1 0.058 2018-01-16
7 1 3 0.172 2018-01-23
8 1 3 0.172 2018-01-23
the output expected is :
ProductID RealQuantity RealWeigth PickingDate
<dbl> <dbl> <dbl> <date>
1 1 6 0.36 2017-12-26
2 1 6 0.358 2018-01-05
3 1 6 0.356 2018-01-13
4 1 6 0.344 2018-01-23
This is an iterative solution that cannot guarantee everything is assigned a package of 6 items:
df <- structure(list(ProductID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), RealQuantity = c(5, 1, 1, 5, 5, 1, 3, 3, 3, 3, 3, 3, 3, 5, 3, 3, 3, 3, 3, 3), RealWeigth = c(0.296, 0.064, 0.061, 0.297, 0.298, 0.058, 0.172, 0.172, 0.177, 0.1695, 0.179, 0.18, 0.175, 0.301, 0.181, 0.178, 0.161, 0.178, 0.1775, 0.183), PickingDate = structure(c(17526, 17536, 17536, 17541, 17544, 17547, 17554, 17554, 17554, 17555, 17556, 17556, 17557, 17557, 17557, 17557, 17558, 17561, 17562, 17562), class = "Date")), row.names = c(NA, -20L), class = c("tbl_df", "tbl", "data.frame"))
# for portion of data.frame with RealQuantity < 6:
df$group <- NA
df <- df[order(df$RealQuantity, decreasing=TRUE),]
gi <- 0
groupsize <- 0
while (TRUE) {
gi <- gi + 1
# find biggest, non-assigned row:
i <- min(which(is.na(df$group)))
if (is.infinite(i)) # nothing found; which would return a zero-length vector. But min(integer(0)) returns Inf.
break
groupsize <- df$RealQuantity[i]
df$group[i] <- gi
# find next line whos combined quantity might fit within "6":
while (TRUE) {
j <- min(which(is.na(df$group) & df$RealQuantity + groupsize <= 6))
if (is.infinite(j)) # nothing found
break
groupsize <- groupsize + df$RealQuantity[j]
df$group[j] <- gi
if (groupsize >= 6)
break
}
}
library(dplyr)
df %>% group_by(group) %>% summarise_at(vars(RealQuantity, RealWeigth), funs(sum))
or some more details:
df %>% group_by(group) %>% summarise(combined=paste(RealQuantity, collapse=', '), RealQuantity=sum(RealQuantity), RealWeigth=sum(RealWeigth), Firstdate=min(PickingDate))

How can I make conditional selections using dplyr in R?

I have the following situation. Given the table
df <- data.frame(ID = c(1, 2, 2, 3, 3, 4),
type = c("MC", "MC", "MK", "MC", "MK", "MC"),
value1 = c(512, 261, 4523, 1004, 1221, 2556),
value2 = c(726, 4000, 280, 998, 113, 6789))
I am trying to find a way to implement the following logic: If for an ID, both types (MC and MK) occur, use value1 from MK and value2 from MC. Otherwise (only the type MC occurs), use MC.
Hence, the final result is supposed to be:
data.frame(ID = c(1, 2, 3, 4),
type = c("MC", "MC", "MC", "MC"),
value1 = c(512, 4523, 1221, 2556),
value2 = c(726, 4000, 998, 6789))
Assuming the type MK is dropped after extracting the value1.
Another version with dplyr
library(dplyr)
df %>%
group_by(ID) %>%
mutate(value1 = ifelse(any(type == "MK"), value1[type=="MK"],value1[type=="MC"]),
value2 = value2[type == "MC"]) %>%
filter(type == "MC")
# ID type value1 value2
# <dbl> <fct> <dbl> <dbl>
#1 1 MC 512 726
#2 2 MC 4523 4000
#3 3 MC 1221 998
#4 4 MC 2556 6789
Here, for value1 we check value in "MK" if it is present or take corresponding "MC" value instead and for value2 by default we take "MC" value and keep only rows with type "MC". This is assuming every group (ID) would have a "MC" type row.
For efficiency I would definitely prefer #Andre Elrico' answer but here is a dplyr option. Try:
df <- data.frame(ID = c(1, 2, 2, 3, 3, 4),
type = c("MC", "MC", "MK", "MC", "MK", "MC"),
value1 = c(512, 261, 4523, 1004, 1221, 2556),
value2 = c(726, 4000, 280, 998, 113, 6789))
library(dplyr)
df %>%
reshape(., idvar = "ID", timevar = "type", direction = "wide") %>%
group_by(ID) %>%
mutate(value1 = ifelse(is.na(value1.MK), value1.MC, value1.MK),
value2 = ifelse(is.na(value2.MC), value2.MK, value2.MC),
type = "MC") %>%
select(ID, type, value1, value2)
# output
# A tibble: 4 x 4
# Groups: ID [4]
ID type value1 value2
<dbl> <chr> <dbl> <dbl>
1 1 MC 512 726
2 2 MC 4523 4000
3 3 MC 1221 998
4 4 MC 2556 6789
data.table solution
setDT(df1)[,{x=.SD;if(all(c("MC","MK") %in% type)){x$value1[] = last(value1)};first(x)},by=ID]
result:
# ID type value1 value2
#1 1 MC 512 726
#2 2 MC 4523 4000
#3 3 MC 1221 998
#4 4 MC 2556 6789
dplyr:
df1 %>% group_by(ID) %>% do(.,(function(x){if(all(c("MC","MK") %in% x$type)){x$value1[] = x$value1[x$type=="MK"]};x[1,]})(.))
# A tibble: 4 x 4
# Groups: ID [4]
# ID type value1 value2
# <dbl> <fct> <dbl> <dbl>
#1 1 MC 512 726
#2 2 MC 4523 4000
#3 3 MC 1221 998
#4 4 MC 2556 6789

Combining datapoints using an index dataframe in R

I have two dataframes, and I'd like to use one as reference for combining observations in the other one.
First, I have data:
> data
upc fips_state_code mymonth price units year sales
1 1153801013 2 3 25.84620 235 2008 6073.8563
2 1153801013 1 2 28.61981 108 2009 3090.9396
3 1153801013 2 2 27.99000 7 2009 195.9300
4 1153801013 1 1 27.99000 4 2009 111.9600
5 1153801013 1 3 27.99000 7 2008 195.9300
6 72105922753 1 3 27.10816 163 2008 4418.6306
7 72105922765 2 2 24.79000 3 2010 74.3700
8 72105922765 2 2 25.99000 1 2009 25.9900
9 72105922765 1 2 23.58091 13 2009 306.5518
10 1071917100 2 2 300.07000 1 2009 300.0700
11 1071917100 1 3 307.07000 2 2008 614.1400
12 1071917100 2 3 269.99000 1 2010 269.9900
13 1461503541 2 2 0.65200 8 2008 5.2160
14 1461503541 2 2 13.99000 11 2010 153.8900
15 1461503541 1 1 0.87000 1 2008 0.8700
16 11111111 1 1 3.00000 2 2008 6.0000
17 11111112 1 1 6.00000 5 2008 30.0000
Then, I have z, which is the reference:
> z
upc code
3 1153801013 52161
1932 72105922753 52161
1934 72105922765 52161
2027 81153801013 52161
2033 81153801041 52161
2 1071917100 50174
1256 8723610700 50174
I want to combine datapoints in data whose upc is the same in z.
In the sample I gave to you, there are 7 different UPC's.
1071917100 is also in z, with the code 50174. However, the only other upc with this code is 8723610700, which is not in data. Therefore, it remains unchanged.
1461503541, 11111111, and 11111112 are not in z at all, so therefore they also remains unchanged.
1153801013, 72105922753, and 72105922765 all share the same code in z, 52161. Therefore, I want to combine all the observations with these upc's.
I want to do this in a really specific way:
First, I want to choose the UPC with the greatest amount of sales across the data. 1153801013 has 9668.616 in sales (simply the sum of all sales with that upc). 72105922753 has 4418.631 in sales. 72105922765 has 406.9118 in sales. Therefore, I choose 1153801013 as the upc for all of them.
Now having chosen this upc, I want to change 72105922753 and 72105922765 to 1153801013 in data.
Now we have a dataset that looks like this:
> data1
upc fips_state_code mymonth price units year sales
1 1153801013 2 3 25.84620 235 2008 6073.8563
2 1153801013 1 2 28.61981 108 2009 3090.9396
3 1153801013 2 2 27.99000 7 2009 195.9300
4 1153801013 1 1 27.99000 4 2009 111.9600
5 1153801013 1 3 27.99000 7 2008 195.9300
6 1153801013 1 3 27.10816 163 2008 4418.6306
7 1153801013 2 2 24.79000 3 2010 74.3700
8 1153801013 2 2 25.99000 1 2009 25.9900
9 1153801013 1 2 23.58091 13 2009 306.5518
10 1071917100 2 2 300.07000 1 2009 300.0700
11 1071917100 1 3 307.07000 2 2008 614.1400
12 1071917100 2 3 269.99000 1 2010 269.9900
13 1461503541 2 2 0.65200 8 2008 5.2160
14 1461503541 2 2 13.99000 11 2010 153.8900
15 1461503541 1 1 0.87000 1 2008 0.8700
16 11111111 1 1 3.00000 2 2008 6.0000
17 11111112 1 1 6.00000 5 2008 30.0000
Finally, I want to combine all the datapoints with the same year, mymonth, and fips_state_code. The way this will happen is by adding up the sales and unit numbers of datapoints with the same upc, fips_state_code, mymonth, and year, and then recalculating the weighted price. (I.e., price = total Sales / total Units.)
And so, the final data set should look like this:
> data2
upc fips_state_code mymonth price units year sales
1 1153801013 2 3 25.84620 235 2008 6073.856
2 1153801013 1 2 28.07844 121 2009 3397.491
3 1153801013 2 2 27.74000 8 2009 221.920
4 1153801013 1 1 27.99000 4 2009 111.960
5 1153801013 1 3 27.14448 170 2008 4614.561
6 1153801013 2 2 24.79000 3 2010 74.370
7 1071917100 2 2 300.07000 1 2009 300.070
8 1071917100 1 3 307.07000 2 2008 614.140
9 1071917100 2 3 269.99000 1 2010 269.990
10 1461503541 2 2 0.65200 8 2008 5.216
11 1461503541 2 2 13.99000 11 2010 153.890
12 1461503541 1 1 0.87000 1 2008 0.870
13 11111111 1 1 3.00000 2 2008 6.000
14 11111112 1 1 6.00000 5 2008 30.000
I did try to do this myself, but it seems like it could be done more efficiently than my code using dplyr, and I couldn't accomplish the last part successfully. Please let me know if anything is unclear, and thank you very much in advance.
Here is the dput code:
data<-structure(list(upc = c(1153801013, 1153801013, 1153801013, 1153801013,
1153801013, 72105922753, 72105922765, 72105922765, 72105922765,
1071917100, 1071917100, 1071917100, 1461503541, 1461503541, 1461503541,
11111111, 11111112), fips_state_code = c(2, 1, 2, 1, 1, 1, 2,
2, 1, 2, 1, 2, 2, 2, 1, 1, 1), mymonth = c(3, 2, 2, 1, 3, 3,
2, 2, 2, 2, 3, 3, 2, 2, 1, 1, 1), price = c(25.8461971831, 28.6198113208,
27.99, 27.99, 27.99, 27.1081632653, 24.79, 25.99, 23.5809090909,
300.07, 307.07, 269.99, 0.652, 13.99, 0.87, 3, 6), units = c(235,
108, 7, 4, 7, 163, 3, 1, 13, 1, 2, 1, 8, 11, 1, 2, 5), year = c(2008,
2009, 2009, 2009, 2008, 2008, 2010, 2009, 2009, 2009, 2008, 2010,
2008, 2010, 2008, 2008, 2008), sales = c(6073.8563380285, 3090.9396226464,
195.93, 111.96, 195.93, 4418.6306122439, 74.37, 25.99, 306.5518181817,
300.07, 614.14, 269.99, 5.216, 153.89, 0.87, 6, 30)), .Names = c("upc",
"fips_state_code", "mymonth", "price", "units", "year", "sales"
), row.names = c(NA, 17L), class = c("tbl_df", "data.frame"))
z<-structure(list(upc = c(1153801013, 72105922753, 72105922765,
81153801013, 81153801041, 1071917100, 8723610700), code = c(52161L,
52161L, 52161L, 52161L, 52161L, 50174L, 50174L)), .Names = c("upc",
"code"), row.names = c(3L, 1932L, 1934L, 2027L, 2033L, 2L, 1256L
), class = "data.frame")
data1<-structure(list(upc = c(1153801013, 1153801013, 1153801013, 1153801013,
1153801013, 1153801013, 1153801013, 1153801013, 1153801013, 1071917100,
1071917100, 1071917100, 1461503541, 1461503541, 1461503541, 11111111,
11111112), fips_state_code = c(2, 1, 2, 1, 1, 1, 2, 2, 1, 2,
1, 2, 2, 2, 1, 1, 1), mymonth = c(3, 2, 2, 1, 3, 3, 2, 2, 2,
2, 3, 3, 2, 2, 1, 1, 1), price = c(25.8461971831, 28.6198113208,
27.99, 27.99, 27.99, 27.1081632653, 24.79, 25.99, 23.5809090909,
300.07, 307.07, 269.99, 0.652, 13.99, 0.87, 3, 6), units = c(235,
108, 7, 4, 7, 163, 3, 1, 13, 1, 2, 1, 8, 11, 1, 2, 5), year = c(2008,
2009, 2009, 2009, 2008, 2008, 2010, 2009, 2009, 2009, 2008, 2010,
2008, 2010, 2008, 2008, 2008), sales = c(6073.8563380285, 3090.9396226464,
195.93, 111.96, 195.93, 4418.6306122439, 74.37, 25.99, 306.5518181817,
300.07, 614.14, 269.99, 5.216, 153.89, 0.87, 6, 30)), .Names = c("upc",
"fips_state_code", "mymonth", "price", "units", "year", "sales"
), row.names = c(NA, 17L), class = c("tbl_df", "data.frame"))
data2<-structure(list(upc = c(1153801013, 1153801013, 1153801013, 1153801013,
1153801013, 1153801013, 1071917100, 1071917100, 1071917100, 1461503541,
1461503541, 1461503541, 11111111, 11111112), fips_state_code = c(2,
1, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 1), mymonth = c(3, 2, 2,
1, 3, 2, 2, 3, 3, 2, 2, 1, 1, 1), price = c(25.8461971831, 28.07844,
27.74, 27.99, 27.14448, 24.79, 300.07, 307.07, 269.99, 0.652,
13.99, 0.87, 3, 6), units = c(235, 121, 8, 4, 170, 3, 1, 2, 1,
8, 11, 1, 2, 5), year = c(2008, 2009, 2009, 2009, 2008, 2010,
2009, 2008, 2010, 2008, 2010, 2008, 2008, 2008), sales = c(6073.8563380285,
3397.491, 221.92, 111.96, 4614.561, 74.37, 300.07, 614.14, 269.99,
5.216, 153.89, 0.87, 6, 30)), .Names = c("upc", "fips_state_code",
"mymonth", "price", "units", "year", "sales"), row.names = c(NA,
14L), class = c("tbl_df", "data.frame"))
This is what I have attempted so far:
w <- z[match(unique(z$code), z$code),]
w <- plyr::rename(w,c("upc"="upc1"))
data <- merge(x=data,y=z,by="upc",all.x=T,all.y=F)
data <- merge(x=data,y=w,by="code",all.x=T,all.y=F)
data <- within(data, upc2 <- ifelse(!is.na(upc1),upc1,upc))
data$upc <- data$upc2
data$upc1 <- data$upc2 <- data$code <- NULL
data <- data[complete.cases(data),]
attach(data)
data <- aggregate(data,by=list(upc,fips_state_code,year,mymonth),FUN=sum)
data$price <- data$sales / data$units
detach(data)
data$Group.1 <- data$Group.2 <- data$Group.3 <- data$Group.4 <- NULL
I can't figure out how to make the upc chosen be the one with the most sales. It would also be great if there were a way to do this in fewer lines of code and more elegantly.

Apply custom function to specific row/column

I'm trying to solve a much larger problem using this basic example. I need to apply a function based on the location from which() because I need to know the year from df1 where the value is NA or >= 150. Then I subset df2, get the mean, and return it to the exact row. Right now I'm using a for() loop and need something much faster as the data I have is very large. Is there a common way to do this?
dput:
df1 <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = c("TMAX", "TMIN", "TMAX",
"TMIN", "TMAX", "TMIN", "TMAX", "TMIN", "TMAX", "TMIN"), year = 1900:1909,
month = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), day = c(1, 1, 1,
1, 1, 1, 1, 1, 1, 1), value = c(30.02, NA, 37.94, 10.94,
NA, 28.04, 64.94, 41, 200, 51.08)), .Names = c("id", "element",
"year", "month", "day", "value"), row.names = c(NA, -10L), class = c("tbl_df",
"data.frame"))
df2 <-structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = c("TMAX", "TMIN", "TMAX",
"TMIN", "TMAX", "TMIN", "TMAX", "TMIN", "TMAX", "TMIN"), year = 1900:1909,
month = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), day = c(1, 1, 1,
1, 1, 1, 1, 1, 1, 1), value = c(30.02, 10.94, 37.94, 10.94,
12, 28.04, 64.94, 41, 82.04, 51.08)), row.names = c(NA, -10L
), class = c("tbl_df", "data.frame"), .Names = c("id", "element",
"year", "month", "day", "value"))
Code:
library(dplyr)
check <- function(df, yr){
df_d <- filter(df, year == yr)
m <- mean(df_d$value)
return(m)
}
for (i in which(is.na(df1$value) | df1$value >= 150)){
df1[i,6] <- check(df = df2, yr = as.numeric(df1[i,3]) )
}
I would recommend the efficient binary join from data.table combined with modification in place (using the :=) while specifying by = .EACHI (in order to calculate the mean for each group separately).
library(data.table)
setDT(df1)[setDT(df2),
value := ifelse(is.na(value) | value >= 150, mean(i.value), value),
on = "year",
by = .EACHI]
df1
# id element year month day value
# 1: USC00031632 TMAX 1900 1 1 30.02
# 2: USC00031632 TMIN 1901 1 1 10.94
# 3: USC00031632 TMAX 1902 2 1 37.94
# 4: USC00031632 TMIN 1903 2 1 10.94
# 5: USC00031632 TMAX 1904 3 1 12.00
# 6: USC00031632 TMIN 1905 3 1 28.04
# 7: USC00031632 TMAX 1906 4 1 64.94
# 8: USC00031632 TMIN 1907 4 1 41.00
# 9: USC00031632 TMAX 1908 5 1 82.04
# 10: USC00031632 TMIN 1909 5 1 51.08
Alternatively, we could do this in two steps in order to try avoiding the ifelse overhead in each step
setDT(df1)[setDT(df2), value2 := i.value, on = "year"]
df1[is.na(value) | value >= 150, value := mean(value2), by = year]
df1
# id element year month day value value2
# 1: USC00031632 TMAX 1900 1 1 30.02 30.02
# 2: USC00031632 TMIN 1901 1 1 10.94 10.94
# 3: USC00031632 TMAX 1902 2 1 37.94 37.94
# 4: USC00031632 TMIN 1903 2 1 10.94 10.94
# 5: USC00031632 TMAX 1904 3 1 12.00 12.00
# 6: USC00031632 TMIN 1905 3 1 28.04 28.04
# 7: USC00031632 TMAX 1906 4 1 64.94 64.94
# 8: USC00031632 TMIN 1907 4 1 41.00 41.00
# 9: USC00031632 TMAX 1908 5 1 82.04 82.04
# 10: USC00031632 TMIN 1909 5 1 51.08 51.08
You can get rid of value2 afterwards if you wish using df1[, value2 := NULL]

Resources