How to group by with an if statement in R?

I have data containing four variables (id, quantity, weight, date), and I want to build packages of quantity = 6 using only observations with quantity below 6. For example, if I have 6 products of quantity 1 each, I want in return a single product of quantity 6 whose weight is sum(weight[which(qte < 6)]), and I want to do this for all elements of the table.
I've tried the code below, but it's not working. Can anyone help me, please?
poids = c()
qte = c()
dd = data.frame()
for (i in length(paquet)) {
  if (paquet$RealQuantity[i] + paquet$RealQuantity[i+1] == 6) {
    poids[i] = sum(paquet$RealWeigth)
    qte[i] = sum(paquet$RealQuantity)
    dd = rbind(dd, data.frame(i = i, poids = poids[i], qte = qte[i]))
  }
}
This is an example:
A tibble: 232 x 4
ProductID RealQuantity RealWeigth PickingDate
<dbl> <dbl> <dbl> <date>
1 1 5 0.296 2017-12-26
2 1 1 0.064 2018-01-05
3 1 1 0.061 2018-01-05
4 1 5 0.297 2018-01-10
5 1 5 0.298 2018-01-13
6 1 1 0.058 2018-01-16
7 1 3 0.172 2018-01-23
8 1 3 0.172 2018-01-23
The expected output is:
ProductID RealQuantity RealWeigth PickingDate
<dbl> <dbl> <dbl> <date>
1 1 6 0.36 2017-12-26
2 1 6 0.358 2018-01-05
3 1 6 0.356 2018-01-13
4 1 6 0.344 2018-01-23

This is an iterative solution that cannot guarantee everything is assigned to a package of 6 items:
df <- structure(list(ProductID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), RealQuantity = c(5, 1, 1, 5, 5, 1, 3, 3, 3, 3, 3, 3, 3, 5, 3, 3, 3, 3, 3, 3), RealWeigth = c(0.296, 0.064, 0.061, 0.297, 0.298, 0.058, 0.172, 0.172, 0.177, 0.1695, 0.179, 0.18, 0.175, 0.301, 0.181, 0.178, 0.161, 0.178, 0.1775, 0.183), PickingDate = structure(c(17526, 17536, 17536, 17541, 17544, 17547, 17554, 17554, 17554, 17555, 17556, 17556, 17557, 17557, 17557, 17557, 17558, 17561, 17562, 17562), class = "Date")), row.names = c(NA, -20L), class = c("tbl_df", "tbl", "data.frame"))
# for portion of data.frame with RealQuantity < 6:
df$group <- NA
df <- df[order(df$RealQuantity, decreasing = TRUE), ]
gi <- 0
groupsize <- 0
while (TRUE) {
  gi <- gi + 1
  # find the biggest non-assigned row:
  i <- min(which(is.na(df$group)))
  if (is.infinite(i)) # nothing found: which() returns a zero-length vector, and min(integer(0)) returns Inf
    break
  groupsize <- df$RealQuantity[i]
  df$group[i] <- gi
  # find the next line whose combined quantity might fit within "6":
  while (TRUE) {
    j <- min(which(is.na(df$group) & df$RealQuantity + groupsize <= 6))
    if (is.infinite(j)) # nothing found
      break
    groupsize <- groupsize + df$RealQuantity[j]
    df$group[j] <- gi
    if (groupsize >= 6)
      break
  }
}
library(dplyr)
df %>% group_by(group) %>% summarise_at(vars(RealQuantity, RealWeigth), funs(sum))
Or, with some more detail:
df %>% group_by(group) %>% summarise(combined=paste(RealQuantity, collapse=', '), RealQuantity=sum(RealQuantity), RealWeigth=sum(RealWeigth), Firstdate=min(PickingDate))
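Note that funs() was deprecated in dplyr 0.8.0. As a minimal sketch, assuming dplyr >= 1.0 is available, the same summary can be written with across():
df %>%
  group_by(group) %>%
  summarise(across(c(RealQuantity, RealWeigth), sum))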

Related

Extract columns from data frames in a list in a separate list of data frames

I have a list, cj1, with multiple data frames:
dput(head(cj1[1:2]))
list(structure(list(individual = c("a12TTT.pdf", "a15.pdf", "a17.pdf",
"a18.pdf", "a21.pdf", "a2TTT.pdf", "a5.pdf", "B11.pdf", "B12.pdf",
"B13.pdf", "B22.pdf", "B24.pdf", "B4.pdf", "B7.pdf", "B8.pdf",
"cw10-1.pdf", "cw13-1.pdf", "cw15-1TTT.pdf", "cw17-1.pdf", "cw18.pdf",
"cw3.pdf", "cw4.pdf", "cw7_1TTT.pdf"), id = 1:23, Ntot = c(13,
9, 16, 15, 9, 13, 10, 10, 11, 10, 14, 10, 11, 12, 11, 10, 15,
12, 14, 11, 9, 10, 11), N1 = c(5, 5, 10, 11, 7, 9, 5, 5, 6, 8,
8, 8, 9, 8, 7, 1, 0, 6, 3, 4, 2, 4, 2), ND = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), N0 = c(8,
4, 6, 4, 2, 4, 5, 5, 5, 2, 6, 2, 2, 4, 4, 9, 15, 6, 11, 7, 7,
6, 9), score = c(5.06923076923077, 4.96666666666667, 9.925, 10.86,
6.83333333333333, 8.88461538461539, 5, 5, 5.97272727272727, 7.82,
7.95714285714286, 7.82, 8.80909090909091, 7.9, 6.91818181818182,
1.24, 0.3, 6, 3.17142857142857, 4.08181818181818, 2.16666666666667,
4.06, 2.19090909090909), propscore = c(0.389940828402367, 0.551851851851852,
0.6203125, 0.724, 0.759259259259259, 0.683431952662722, 0.5,
0.5, 0.54297520661157, 0.782, 0.568367346938776, 0.782, 0.800826446280992,
0.658333333333333, 0.628925619834711, 0.124, 0.02, 0.5, 0.226530612244898,
0.371074380165289, 0.240740740740741, 0.406, 0.199173553719008
), theta = c(-0.571211122446447, 0.418736780198501, 0.464533662219296,
0.760432013134893, 1.43961032059382, 0.935963883364303, 0.0742361005467161,
0.416783201347136, 0.232586422933618, 1.65345248955369, 0.178947462869717,
1.3980442736112, 1.5300599487058, 0.340087410746963, 0.616985944469495,
-1.73246102772711, -4.06186172096556, -0.347700710331151, -1.21009964741398,
0.239145600406579, -1.88836418690337, -0.276451472526056, -0.611455626388059
), se.theta = c(0.689550115014498, 0.689441554709003, 0.595659709892116,
0.609506508256404, 0.917792293663691, 0.652011367164736, 0.720534163064516,
0.695969555549033, 0.661019531367007, 0.87050969318314, 0.605775647419845,
0.797443937820774, 0.768436114096332, 0.695748274310803, 0.709380679025605,
1.00089414765463, 1.8701468050665, 0.68959824350285, 0.733014089189809,
0.656392513303483, 0.952935324276941, 0.71608982789968, 0.771906532861938
), outfit = c(1.24922700170817, 1.46067763769417, 0.915183304626819,
0.753992664091072, 0.37410361433915, 0.727316037037668, 0.616907868814702,
1.01528298230254, 1.01594232662062, 0.616808170683195, 0.646097057961938,
0.622993494551005, 0.807441271101246, 0.788526018181888, 1.2157399735092,
0.341189086206191, 0.021052091633073, 0.543024513106335, 1.04183076617928,
1.1772656963046, 0.736106160865241, 0.756316095787985, 0.58320701094964
), infit = c(1.4078580948461, 1.42854494963967, 1.09762978932861,
0.893957122448352, 0.64936943769433, 0.899191443180872, 0.724956556509282,
1.14975990693782, 1.08074439712469, 0.978248081241133, 0.755557633771936,
0.823903684368671, 0.911855771375284, 0.954272320131035, 0.926253596526142,
0.634052701587448, 0.0504659822408584, 0.712539957033173, 0.966034039620798,
1.1901663169553, 0.81371119642719, 0.817417869881874, 0.737574872116582
)), row.names = c(NA, -23L), class = "data.frame"), structure(list(
parlabel = c("Ties", "Home"), par = c("delta", "eta"), est = c(-43.5016417611571,
0.337872999554289), se = c(366043197.615422, 0.215169736220537
)), row.names = c(NA, -2L), class = "data.frame"))
Here is how the data frames look:
head(cj1[[1]],2)
individual id Ntot N1 ND N0 score propscore theta se.theta outfit
1 a12TTT.pdf 1 13 5 0 8 5.069231 0.3899408 -0.5712111 0.6895501 1.249227
2 a15.pdf 2 9 5 0 4 4.966667 0.5518519 0.4187368 0.6894416 1.460678
infit
1 1.407858
2 1.428545
I would like to create a separate list, results1, containing data frames that include only columns 1 and 9, named individual and theta.
I tried:
results1<-sapply(cj1, "[",c("individual","theta") )
Error in [.data.frame(X[[i]], ...) : undefined columns selected
library(dplyr)
> results1 <- lapply(cj1, function(x) x%>% select(individual,theta))
Error:
Can't subset columns that don't exist.
x Column individual doesn't exist.
Run rlang::last_error() to see where the error occurred.
I can extract these columns from one data frame:
cj1[[1]][c(1, 9)]
but I could not apply this to the whole list.
You can use the following solution. We use .x to refer to each individual element of your list; here .x is each of your data frames, from which we would like to select only the two columns c("individual", "theta").
However, since only one of your data frames contains those column names, I used the keep function to retain only the elements whose data frames contain the desired columns. Just bear in mind that this form of coding, called a purrr-style formula, needs a ~ before .x. The map function is the purrr equivalent of lapply from base R; this syntax lets us apply a function to every individual element (here, the data frames).
library(purrr)
library(dplyr)

cj1 %>%
  map_if(~ all(c("individual", "theta") %in% names(.x)),
         ~ .x %>% select(individual, theta)) %>%
  keep(~ all(c("individual", "theta") %in% names(.x)))
[[1]]
individual theta
1 a12TTT.pdf -0.5712111
2 a15.pdf 0.4187368
3 a17.pdf 0.4645337
4 a18.pdf 0.7604320
5 a21.pdf 1.4396103
6 a2TTT.pdf 0.9359639
7 a5.pdf 0.0742361
8 B11.pdf 0.4167832
9 B12.pdf 0.2325864
10 B13.pdf 1.6534525
11 B22.pdf 0.1789475
12 B24.pdf 1.3980443
13 B4.pdf 1.5300599
14 B7.pdf 0.3400874
15 B8.pdf 0.6169859
16 cw10-1.pdf -1.7324610
17 cw13-1.pdf -4.0618617
18 cw15-1TTT.pdf -0.3477007
19 cw17-1.pdf -1.2100996
20 cw18.pdf 0.2391456
21 cw3.pdf -1.8883642
22 cw4.pdf -0.2764515
23 cw7_1TTT.pdf -0.6114556
Or we can save a line of code and be more concise:
cj1 %>%
  keep(~ all(c("individual", "theta") %in% names(.x))) %>%
  map(~ .x %>% select(individual, theta))
[[1]]
individual theta
1 a12TTT.pdf -0.5712111
2 a15.pdf 0.4187368
3 a17.pdf 0.4645337
4 a18.pdf 0.7604320
5 a21.pdf 1.4396103
6 a2TTT.pdf 0.9359639
7 a5.pdf 0.0742361
8 B11.pdf 0.4167832
9 B12.pdf 0.2325864
10 B13.pdf 1.6534525
11 B22.pdf 0.1789475
12 B24.pdf 1.3980443
13 B4.pdf 1.5300599
14 B7.pdf 0.3400874
15 B8.pdf 0.6169859
16 cw10-1.pdf -1.7324610
17 cw13-1.pdf -4.0618617
18 cw15-1TTT.pdf -0.3477007
19 cw17-1.pdf -1.2100996
20 cw18.pdf 0.2391456
21 cw3.pdf -1.8883642
22 cw4.pdf -0.2764515
23 cw7_1TTT.pdf -0.6114556
Here is just another base R solution with a slightly different syntax. Just note that \(x) is equivalent to function(x), a new feature available as of R 4.1.0.
cj1 |>
  lapply(\(x) {
    if (all(c("individual", "theta") %in% names(x))) {
      `[`(x, c("individual", "theta"))
    }
  }) -> cj2
# drop the NULL elements; logical indexing is safe even when nothing is NULL,
# whereas cj2[-which(...)] would select zero elements in that case
cj2 <- cj2[!sapply(cj2, is.null)] |> as.data.frame()
In base R, you can try this solution with lapply -
cols <- c("individual","theta")
lapply(cj1, function(x) if(all(cols %in% names(x))) x[cols])
#[[1]]
# individual theta
#1 a12TTT.pdf -0.5712
#2 a15.pdf 0.4187
#3 a17.pdf 0.4645
#4 a18.pdf 0.7604
#5 a21.pdf 1.4396
#6 a2TTT.pdf 0.9360
#7 a5.pdf 0.0742
#8 B11.pdf 0.4168
#9 B12.pdf 0.2326
#10 B13.pdf 1.6535
#11 B22.pdf 0.1789
#12 B24.pdf 1.3980
#13 B4.pdf 1.5301
#14 B7.pdf 0.3401
#15 B8.pdf 0.6170
#16 cw10-1.pdf -1.7325
#17 cw13-1.pdf -4.0619
#18 cw15-1TTT.pdf -0.3477
#19 cw17-1.pdf -1.2101
#20 cw18.pdf 0.2391
#21 cw3.pdf -1.8884
#22 cw4.pdf -0.2765
#23 cw7_1TTT.pdf -0.6115
#[[2]]
#NULL
If you want to drop the NULL lists you can add Filter -
Filter(length, lapply(cj1, function(x) if(all(cols %in% names(x))) x[cols]))

R Dplyr solution for summarize_at correlation

I am attempting to calculate correlations, grouped by MktDate, between every column in a data frame and another column (the security return).
I have attempted a number of dplyr solutions and can't quite get the correlation example to work, though I have no issues getting an example using mean to work properly.
This works, calculating the mean of the specified columns:
MyMeanTest <- MyDataTest %>%
  filter(MktDate >= '2009-12-31') %>%
  group_by(MktDate) %>%
  summarize_at(c('RtnVol_EM', 'OCFROI_EM'), mean, na.rm = TRUE)
This does not work. Essentially, I want the correlation of the specified columns, grouped by MktDate, with the column FwdRet_12M. I get the following error message:
Error in summarise_impl(.data, dots) :
Evaluation error: not all arguments have the same length.
MyCorTest <- MyDataTest %>%
  group_by(MktDate) %>%
  summarize_at(c('RtnVol_EM', 'OCFROI_EM'),
               funs(cor(., MyDataTest$FwdRet_12M, use = "pairwise.complete.obs", "spearman")))
With the code example above, I should end up with something like this:
MktDate,RtnVol_EM,OCFROI_EM...
Here is some sample code that should help in understanding the structure of the data and the end objective:
MyDataTest <- structure(list(MktDate = structure(c(17896, 17896, 17896, 17896,
17927, 17927, 17927, 17927), class = "Date"), FwdRet = c(2, 3,
4, 5, 5, 2, 1, 4), Fact1 = c(10, 30, 20, 15, 12, 25, 26, 28),
Fact2 = c(100, 500, 300, 400, 150, 400, 430, 420)), .Names = c("MktDate",
"FwdRet", "Fact1", "Fact2"), row.names = c(NA, -8L), class = "data.frame")
When running the pairwise correlation grouped by date on that data set the following should be the result.
MktDate,Fact1,Fact2
12/31/18,.2,.4
1/31/19,.4,-.8
One possible approach would be to reshape your data so that you have the variable you always want in the correlation (FwdRet) in one column and the variable that changes in a separate column. Like so:
MyDataTest_reshape <- MyDataTest %>%
  gather(factor, value, -MktDate, -FwdRet)
MyDataTest_reshape
MktDate FwdRet factor value
1 2018-12-31 2 Fact1 10
2 2018-12-31 3 Fact1 30
3 2018-12-31 4 Fact1 20
4 2018-12-31 5 Fact1 15
5 2019-01-31 5 Fact1 12
6 2019-01-31 2 Fact1 25
7 2019-01-31 1 Fact1 26
8 2019-01-31 4 Fact1 28
9 2018-12-31 2 Fact2 100
10 2018-12-31 3 Fact2 500
11 2018-12-31 4 Fact2 300
12 2018-12-31 5 Fact2 400
13 2019-01-31 5 Fact2 150
14 2019-01-31 2 Fact2 400
15 2019-01-31 1 Fact2 430
16 2019-01-31 4 Fact2 420
Then you can take that reshaped data and feed it into your correlation:
MyDataTest_reshape %>%
  group_by(MktDate, factor) %>%
  summarize(correlation = cor(FwdRet, value)) %>%
  spread(factor, correlation)
# A tibble: 2 x 3
# Groups: MktDate [2]
MktDate Fact1 Fact2
<date> <dbl> <dbl>
1 2018-12-31 0.0756 0.529
2 2019-01-31 -0.627 -0.736
You can also do this all in one step, of course:
MyDataTest %>%
  gather(factor, value, -MktDate, -FwdRet) %>%
  group_by(MktDate, factor) %>%
  summarize(correlation = cor(FwdRet, value)) %>%
  spread(factor, correlation)
This works for me. (The original attempt failed because MyDataTest$FwdRet_12M refers to the full, ungrouped column, so its length doesn't match each group's columns; referring to FwdRet directly lets cor see the grouped subset.)
library(tidyverse)
MyDataTest <- structure(list(MktDate = structure(c(17896, 17896, 17896, 17896,
17927, 17927, 17927, 17927), class = "Date"), FwdRet = c(2, 3,
4, 5, 5, 2, 1, 4), Fact1 = c(10, 30, 20, 15, 12, 25, 26, 28),
Fact2 = c(100, 500, 300, 400, 150, 400, 430, 420)), .Names = c("MktDate",
"FwdRet", "Fact1", "Fact2"), row.names = c(NA, -8L), class = "data.frame")
MyDataTest %>%
  group_by(MktDate) %>%
  summarize_at(c("Fact1", "Fact2"),
               list(~ cor(., FwdRet, use = "pairwise.complete.obs", "spearman")))
#> # A tibble: 2 x 3
#> MktDate Fact1 Fact2
#> <date> <dbl> <dbl>
#> 1 2018-12-31 0.2 0.4
#> 2 2019-01-31 -0.4 -0.8
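For dplyr >= 1.0, summarize_at and the formula shorthand are superseded by across(). A hedged sketch of the same call using across() with a lambda:
library(dplyr)
MyDataTest %>%
  group_by(MktDate) %>%
  summarize(across(c(Fact1, Fact2),
                   ~ cor(.x, FwdRet, use = "pairwise.complete.obs",
                         method = "spearman")))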

Mutate top n rows without throwing away the other rows

I have the following data.frame below. I would like to create a new column w (for weight). w should equal 1 / n for the industries that have the n highest returns for each given date and should equal 0 for the rest of the industries. I can group_by(date) and use top_n(3, wt = return) to filter the top industries and then mutate(w = 1/n), but how can I mutate without throwing away the other industries where w = 0?
structure(list(date = structure(c(16556, 16556, 16556, 16556,
16556, 16556, 16556, 16556, 16556, 16556, 16587, 16587, 16587,
16587, 16587, 16587, 16587, 16587, 16587, 16587, 16617, 16617,
16617, 16617, 16617, 16617, 16617, 16617, 16617, 16617), class = "Date"),
industry = c("Hlth", "Txtls", "BusEq", "Fin", "ElcEq", "Food",
"Beer", "Books", "Cnstr", "Carry", "Clths", "Txtls", "Fin",
"Games", "Cnstr", "Meals", "Hlth", "Hshld", "Telcm", "Rtail",
"Smoke", "Games", "Clths", "Rtail", "Servs", "Meals", "Food",
"Hlth", "Beer", "Trans"), return = c(4.89, 4.37, 4.02, 2.99,
2.91, 2.03, 2, 1.95, 1.86, 1.75, 4.17, 4.09, 1.33, 1.26,
0.42, 0.29, 0.08, -0.11, -0.45, -0.48, 9.59, 6, 5.97, 5.78,
5.3, 4.15, 4.04, 3.67, 3.51, 3.27)), row.names = c(NA, -30L
), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 30 x 3
date industry return
<date> <chr> <dbl>
1 2015-05-01 Hlth 4.89
2 2015-05-01 Txtls 4.37
3 2015-05-01 BusEq 4.02
4 2015-05-01 Fin 2.99
5 2015-05-01 ElcEq 2.91
6 2015-05-01 Food 2.03
7 2015-05-01 Beer 2
8 2015-05-01 Books 1.95
9 2015-05-01 Cnstr 1.86
10 2015-05-01 Carry 1.75
# ... with 20 more rows
EDIT: How would you handle ties? Suppose there is a tie for third place. The third place weight should be split between 3rd and 4th place (assuming only 2 are tied) with weights of (1/n)/2. The 1st and 2nd place weights stay at 1/n.
EDIT: Suppose n = 3. The top 3 A2 values for each A1 should get a weight w of 1/3 if there are no ties. If there is a tie for 3rd place (T3), then we have (1st, 2nd, T3, T3) and I would like weights to be 1/3, 1/3, 1/6, 1/6 to maintain a total weight of 1. This is only for 3rd place however. (1st, T2, T2) should have weights of 1/3, 1/3, 1/3. (T1, T1, T2, T2) should have weights of 1/3, 1/3, 1/6, 1/6, etc.
structure(list(A1 = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("A", "B"), class = "factor"), A2 = c(1, 3, 3,
4, 5, 6, 7, 8, 8)), row.names = c(NA, -9L), class = "data.frame")
The output for df should be:
> df
A1 A2 w
1 A 1 0
2 A 3 0.1666
3 A 3 0.1666
4 A 4 0.3333
5 A 5 0.3333
6 B 6 0
7 B 7 0.3333
8 B 8 0.3333
9 B 8 0.3333
We could create a condition with ifelse. After grouping by 'date' and arranging the dataset by 'date' and by 'return' in descending order, create 'w' with the condition that if row_number() is at most n, 'return' is divided by n; otherwise w is 0:
n <- 3
df1 %>%
  group_by(date) %>%
  arrange(date, -return) %>%
  mutate(w = ifelse(row_number() <= n, return/n, 0))
If we are using top_n, then create the column 'w' in the filtered dataset and join it back to the original (replace_na is from tidyr):
df1 %>%
  group_by(date) %>%
  top_n(return, n = 3) %>%
  mutate(w = return/n()) %>%
  right_join(df1) %>%
  mutate(w = replace_na(w, 0))
We can group by date, sort the return variable, get the last 3 entries (the top 3), and return return/n, or else 0.
library(dplyr)
n <- 3
df %>%
  group_by(date) %>%
  mutate(w = ifelse(return %in% tail(sort(return), n), return/n, 0))
# date industry return w
# <date> <chr> <dbl> <dbl>
# 1 2015-05-01 Hlth 4.89 1.63
# 2 2015-05-01 Txtls 4.37 1.46
# 3 2015-05-01 BusEq 4.02 1.34
# 4 2015-05-01 Fin 2.99 0
# 5 2015-05-01 ElcEq 2.91 0
# 6 2015-05-01 Food 2.03 0
# 7 2015-05-01 Beer 2 0
#....
The base R equivalent of the same logic uses ave:
ave(df$return, df$date, FUN = function(x) ifelse(x %in% tail(sort(x), n), x/n, 0))
EDIT
As mentioned in the comments, in the case of ties the OP wants to return (1/n)/2, or more generally to divide by the number of ties.
For this I have created a new, simpler data frame that makes it easier to see what is going on.
df <- data.frame(A1 = rep(c("A", "B"),c(5, 4)), A2 = 1:9)
df$A2[2] <- 3
If we use the current code, it gives:
df %>%
  group_by(A1) %>%
  mutate(w = ifelse(A2 %in% tail(sort(A2), n), A2/n, 0))
# A tibble: 9 x 3
# Groups: A1 [2]
# A1 A2 w
# <fct> <int> <dbl>
#1 A 1 0
#2 A 3 1
#3 A 3 1
#4 A 4 1.33
#5 A 5 1.67
#6 B 6 0
#7 B 7 2.33
#8 B 8 2.67
#9 B 9 3
which is not what we want. To avoid that, we can group by A2 again and, only for the rows where w != 0, divide w by the number of occurrences of A2.
df %>%
  group_by(A1) %>%
  mutate(w = ifelse(A2 %in% tail(sort(A2), n), A2/n, 0)) %>%
  group_by(A2) %>%
  mutate(w1 = ifelse(w != 0, w/n(), w)) %>%
  ungroup()
# A1 A2 w w1
# <fct> <dbl> <dbl> <dbl>
#1 A 1 0 0
#2 A 3 1 0.5
#3 A 3 1 0.5
#4 A 4 1.33 1.33
#5 A 5 1.67 1.67
#6 B 6 0 0
#7 B 7 2.33 2.33
#8 B 8 2.67 2.67
#9 B 9 3 3
Another EDIT
It turns out we want to split w only for the last (cutoff) group, and the w in each group should sum to 1. For the updated dataset we can do:
n <- 3
temp_df <- df %>%
  group_by(A1) %>%
  top_n(n, A2)

temp_df %>%
  arrange(A1, A2) %>%
  mutate(w = ifelse(A2 == A2[1],
                    (1 - (1/n * sum(A2 != A2[1]))) / sum(A2 == A2[1]),
                    1/n)) %>%
  bind_rows(anti_join(df, temp_df) %>% mutate(w = 0)) %>%
  arrange(A1, A2)
# A1 A2 w
# <fct> <dbl> <dbl>
#1 A 1 0
#2 A 3 0.167
#3 A 3 0.167
#4 A 4 0.333
#5 A 5 0.333
#6 B 6 0
#7 B 7 0.333
#8 B 8 0.333
#9 B 8 0.333
Let's try another variation, where all the values of one group are the same.
df1 <- df
df1$A2[6:9] <- 10

temp_df <- df1 %>%
  group_by(A1) %>%
  top_n(n, A2)

temp_df %>%
  arrange(A1, A2) %>%
  mutate(w = ifelse(A2 == A2[1],
                    (1 - (1/n * sum(A2 != A2[1]))) / sum(A2 == A2[1]),
                    1/n)) %>%
  bind_rows(anti_join(df1, temp_df) %>% mutate(w = 0)) %>%
  arrange(A1, A2)
# A1 A2 w
# <fct> <dbl> <dbl>
#1 A 1 0
#2 A 3 0.167
#3 A 3 0.167
#4 A 4 0.333
#5 A 5 0.333
#6 B 10 0.25
#7 B 10 0.25
#8 B 10 0.25
#9 B 10 0.25
The logic: we select the top 3 A2 values within each group using top_n. With anti_join we get all the rows that are not in the top 3 and assign them a fixed weight w of 0. For the rows that are in the top 3, we take the rows at the cutoff value and assign them whatever weight remains after the non-cutoff rows receive 1/n each.
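The same tie rule can also be encoded in one mutate with case_when, splitting whatever weight remains across the rows tied at the cutoff value. This is a sketch under the same assumptions (dplyr loaded, n = 3, each group has at least n rows); cutoff is a helper column introduced here for illustration:
library(dplyr)
n <- 3
df %>%
  group_by(A1) %>%
  mutate(cutoff = sort(A2, decreasing = TRUE)[n],  # the value sitting in n-th place
         w = case_when(
           A2 > cutoff  ~ 1/n,                     # places above the cutoff get the full weight
           A2 == cutoff ~ (1 - (1/n) * sum(A2 > cutoff)) / sum(A2 == cutoff),
           TRUE         ~ 0                        # everything below the cutoff
         )) %>%
  select(-cutoff) %>%
  ungroup()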

Apply a function to a subset of many columns in R

How do I apply a function to many columns of grouped rows? For example;
library(tidyverse)
data <- tribble(
  ~Date, ~Seq1, ~Component, ~Seq2, ~X1, ~X2, ~X3,
  "01/01/18", 1, "Smooth", NA, 3.98, 2.75, 1.82,
  "01/01/18", 2, "Smooth", NA, 1.02, 0.02, -0.04,
  "01/01/18", 3, "Smooth", NA, 3.48, 3.06, 1.25,
  "01/01/18", 3, "Bounce", 1, 2.01, -0.43, -0.52,
  "01/01/18", 3, "Bounce", 2, 1.94, 1.53, 1.92) %>%
  mutate_at(vars(Date, Seq1, Component, Seq2), funs(factor))
Each column of X values (many more columns, truncated here for clarity) is grouped by Date, Seq1, Component, and Seq2. While each Component "Smooth" row (with Seq2 "NA") is a single constant row, within the Component "Bounce" level there are multiple Seq2 levels, e.g. "1", "2", etc.
How do I sum each X column, always adding the constant Seq2 "NA" row to each level of Seq2?
The desired results is:
expected <- tribble(
  ~Date, ~Seq1, ~Component, ~Seq2, ~X1, ~X2, ~X3,
  "01/01/18", 1, "Smooth", NA, 3.98, 2.75, 1.82,
  "01/01/18", 2, "Smooth", NA, 1.02, 0.02, -0.04,
  "01/01/18", 3, "Smooth", NA, 3.48, 3.06, 1.25,
  "01/01/18", 3, "Bounce", 1, 5.49, 3.49, 1.77,
  "01/01/18", 3, "Bounce", 2, 5.42, 4.59, 3.17)
The following example just sums everything within each Date and Seq1 group:
data %>%
  group_by(Date, Seq1) %>%
  mutate_at(vars(starts_with("X")), funs(sum(.)))
#> # A tibble: 5 x 7
#> # Groups: Date, Seq1 [3]
#> Date Seq1 Component Seq2 X1 X2 X3
#> <fct> <fct> <fct> <fct> <dbl> <dbl> <dbl>
#> 1 01/01/18 1 Smooth <NA> 3.98 2.75 1.82
#> 2 01/01/18 2 Smooth <NA> 1.02 0.02 -0.04
#> 3 01/01/18 3 Smooth <NA> 7.43 4.16 2.65
#> 4 01/01/18 3 Bounce 1 7.43 4.16 2.65
#> 5 01/01/18 3 Bounce 2 7.43 4.16 2.65
I am certain there is solution within the purrr or apply function family, however, I have been unsuccessful (for days) in solving this example. The actual data has about 180 X columns, with hundreds of Date and Seq1 combinations, and multiple Seq2 levels.
A similar example could be Summing Multiple Groups of Columns, How to apply a function to a subset of columns in r?, or even perhaps https://github.com/jennybc/row-oriented-workflows.
Created on 2018-10-23 by the reprex package (v0.2.1)
Here's my solution. This problem is not really a purrr task, because there is nothing you want to map a single function over. Instead, what I understand the problem to be is that you want to match each X value in a Bounce row with the corresponding Smooth row's X values for the same Date and Seq1 (and there is only one such row). This means it is really a merging or joining problem, and the approach is to set up the join so that you can match the right values and do the sum. So I go as follows:
1. Split the data into the Smooth rows and the Bounce rows, and gather so that all the X values are in one column.
2. Join the smooths onto the bounces with a left_join, so each original Bounce row now has its corresponding Smooth values.
3. mutate the sum into a new column and select/rename the columns to be as in the original.
4. bind_rows to rejoin the newly summed bounces, and spread to return to the original layout.
This should be robust to any number of Date, Seq1, Seq2 and X values.
library(tidyverse)
data <- tribble(
  ~Date, ~Seq1, ~Component, ~Seq2, ~X1, ~X2, ~X3,
  "01/01/18", 1, "Smooth", NA, 3.98, 2.75, 1.82,
  "01/01/18", 2, "Smooth", NA, 1.02, 0.02, -0.04,
  "01/01/18", 3, "Smooth", NA, 3.48, 3.06, 1.25,
  "01/01/18", 3, "Bounce", 1, 2.01, -0.43, -0.52,
  "01/01/18", 3, "Bounce", 2, 1.94, 1.53, 1.92)
smooths <- data %>%
  filter(Component == "Smooth") %>%
  gather(X, val, starts_with("X"))

bounces <- data %>%
  filter(Component == "Bounce") %>%
  gather(X, val, starts_with("X")) %>%
  left_join(smooths, by = c("Date", "Seq1", "X")) %>%
  mutate(val = val.x + val.y) %>%
  select(Date, Seq1, Component = Component.x, Seq2 = Seq2.x, X, val)

bounces %>%
  bind_rows(smooths) %>%
  spread(X, val)
#> # A tibble: 5 x 7
#> Date Seq1 Component Seq2 X1 X2 X3
#> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 01/01/18 1 Smooth NA 3.98 2.75 1.82
#> 2 01/01/18 2 Smooth NA 1.02 0.02 -0.04
#> 3 01/01/18 3 Bounce 1 5.49 2.63 0.73
#> 4 01/01/18 3 Bounce 2 5.42 4.59 3.17
#> 5 01/01/18 3 Smooth NA 3.48 3.06 1.25
Created on 2018-10-31 by the reprex package (v0.2.1)
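As of tidyr 1.0, gather and spread are superseded by pivot_longer and pivot_wider. A sketch of the same join logic with the newer verbs (same data as above; the .b/.s suffixes are names chosen here for readability):
smooths <- data %>%
  filter(Component == "Smooth") %>%
  pivot_longer(starts_with("X"), names_to = "X", values_to = "val")

bounces <- data %>%
  filter(Component == "Bounce") %>%
  pivot_longer(starts_with("X"), names_to = "X", values_to = "val") %>%
  left_join(smooths, by = c("Date", "Seq1", "X"), suffix = c(".b", ".s")) %>%
  mutate(val = val.b + val.s) %>%
  select(Date, Seq1, Component = Component.b, Seq2 = Seq2.b, X, val)

bind_rows(bounces, smooths) %>%
  pivot_wider(names_from = X, values_from = val)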

Calculate and output the date of customer's first order

Data:
DB <- data.frame(orderID = c(1,2,3,4,4,5,6,6,7,8),
orderDate = c("1.1.12","1.1.12","1.1.12","13.1.12","13.1.12","12.1.12","10.1.12","10.1.12","21.1.12","24.1.12"),
itemID = c(2,3,2,5,12,4,2,3,1,5),
customerID = c(1, 2, 3, 1, 1, 3, 2, 2, 1, 1),
itemPrice = c(9.99, 14.99, 9.99, 19.99, 29.99, 4.99, 9.99, 14.99, 49.99, 19.99))
Expected outcome:
DB <- data.frame(orderID = c(1,2,3,4,4,5,6,6,7,8),
orderDate = c("1.1.12","2.1.12","3.1.12","13.1.12","13.1.12","12.1.12","10.1.12","10.1.12","21.1.12","24.1.12"),
itemID = c(2,3,2,5,12,4,2,3,1,5),
customerID = c(1, 2, 3, 1, 1, 3, 2, 2, 1, 1),
itemPrice = c(9.99, 14.99, 9.99, 19.99, 29.99, 4.99, 9.99, 14.99, 49.99, 19.99),
DateOfFirstOrderofCustomer = c("1.1.12", "2.1.12", "3.1.12", "1.1.12", "1.1.12", "3.1.12", "2.1.12", "2.1.12", "1.1.12", "1.1.12"))
For understanding:
The orderID is sequential. Products ordered by the same customerID on the same day get the same orderID; when the same customer orders products on another day, it gets a new orderID.
I want to add a column to every row/entry containing the date of that customer's first order (e.g. customer 1 (customerID 1) made his first order on 1.1.12, so this date is entered for all orders from this customer). How can we do this?
The original data has about 500k rows, so please give a solution that performs well.
With just base R functions:
# convert the date column to date-format
DB$orderDate <- as.Date(DB$orderDate, format('%d.%m.%y'))
# get the first date for each customer
DB$DateFirstOrder <- with(DB, ave(orderDate, customerID, FUN = min))
The result is then (using the data of Mike Spencer):
> DB
orderID orderDate itemID customerID itemPrice DateFirstOrder
1 1 2012-01-01 2 1 9.99 2012-01-01
2 2 2012-01-04 3 2 14.99 2012-01-04
3 3 2012-01-06 2 3 9.99 2012-01-06
4 4 2012-01-13 5 1 19.99 2012-01-01
5 4 2012-01-13 12 1 29.99 2012-01-01
6 5 2012-01-12 4 3 4.99 2012-01-06
7 6 2012-01-10 2 2 9.99 2012-01-04
8 6 2012-01-10 3 2 14.99 2012-01-04
9 7 2012-01-21 1 1 49.99 2012-01-01
10 8 2012-01-24 5 1 19.99 2012-01-01
For the fastest solution, I would recommend the data.table package. To get the desired result with this package, you need to do:
library(data.table)
setDT(DB)[, orderDate := as.Date(orderDate, format('%d.%m.%y'))
][, DateFirstOrder := min(orderDate), by = customerID]
I've changed your input data, as the ones you provided all had the same date of first purchase for each customer, so it was impossible to tell whether the code worked. The example uses dplyr; you could also use tapply, but you'd need to reformat the named vector (see the sketch after this code).
# Dummy data
DB <- data.frame(orderID = c(1,2,3,4,4,5,6,6,7,8),
orderDate = c("1.1.12","4.1.12","6.1.12","13.1.12","13.1.12","12.1.12","10.1.12","10.1.12","21.1.12","24.1.12"),
itemID = c(2,3,2,5,12,4,2,3,1,5),
customerID = c(1, 2, 3, 1, 1, 3, 2, 2, 1, 1),
itemPrice = c(9.99, 14.99, 9.99, 19.99, 29.99, 4.99, 9.99, 14.99, 49.99, 19.99))
# -------------------------------------------
# Change dates to a readable format
DB$orderDate <- as.Date(DB$orderDate, format="%d.%m.%y")
# -------------------------------------------
library(dplyr)
DB <- DB %>%
  group_by(customerID) %>%
  mutate(DateOfFirstOrderofCustomer = min(orderDate))
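A sketch of the tapply route mentioned above: tapply simplifies its result to a named numeric vector (the Date class is dropped), so the values have to be matched back to the rows by customer ID and converted to dates again:
first_order <- tapply(DB$orderDate, DB$customerID, min)
DB$DateOfFirstOrderofCustomer <- as.Date(first_order[as.character(DB$customerID)],
                                         origin = "1970-01-01")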
I am using the plyr package; everything else is the same.
DB <- data.frame(orderID = c(1,2,3,4,4,5,6,6,7,8),
orderDate = c("1.1.12","4.1.12","6.1.12","13.1.12","13.1.12","12.1.12","10.1.12","10.1.12","21.1.12","24.1.12"),
itemID = c(2,3,2,5,12,4,2,3,1,5),
customerID = c(1, 2, 3, 1, 1, 3, 2, 2, 1, 1),
itemPrice = c(9.99, 14.99, 9.99, 19.99, 29.99, 4.99, 9.99, 14.99, 49.99, 19.99))
install.packages("plyr")
library(plyr)
DB$orderDate <- as.Date(DB$orderDate, format="%d.%m.%y")
DB = ddply(DB, .(customerID), mutate, DateOfFirstOrderofCustomer = min(orderDate))
