Creating an "other" field - r

Right now, I have the following data.frame which was created by original.df %.% group_by(Category) %.% tally() %.% arrange(desc(n)).
DF <- structure(list(Category = c("E", "K", "M", "L", "I", "A",
"S", "G", "N", "Q"), n = c(163051, 127133, 106680, 64868, 49701,
47387, 47096, 45601, 40056, 36882)), .Names = c("Category",
"n"), row.names = c(NA, 10L), class = c("tbl_df", "tbl", "data.frame"
))
Category n
1 E 163051
2 K 127133
3 M 106680
4 L 64868
5 I 49701
6 A 47387
7 S 47096
8 G 45601
9 N 40056
10 Q 36882
I want to create an "Other" field from the bottom ranked Categories by n. i.e.
Category n
1 E 163051
2 K 127133
3 M 106680
4 L 64868
5 I 49701
6 Other 217022
Right now, I am doing
rbind(filter(DF, rank(rev(n)) <= 5),
summarise(filter(DF, rank(rev(n)) > 5), Category = "Other", n = sum(n)))
which collapses all categories not in the top 5 into the Other category.
But I'm curious whether there's a better way in dplyr or some other existing package. By "better" I mean more succinct/readable. I'm also interested in methods with cleverer or more flexible ways to choose Other.

This is another approach, assuming that each category (of the top 5 at least) only occurs once:
df %.%
arrange(desc(n)) %.% #you could skip this step since you arranged the input df already according to your question
mutate(Category = ifelse(1:n() > 5, "Other", Category)) %.%
group_by(Category) %.%
summarize(n = sum(n))
# Category n
#1 E 163051
#2 I 49701
#3 K 127133
#4 L 64868
#5 M 106680
#6 Other 217022
Edit:
I just noticed that my output is not order by decreasing n any more. After running the code again, I found out that the order is kept until after the group_by(Category) but when I run the summarize afterwards, the order is gone (or rather, it seems to be ordered by Category). Is that supposed to be like that?
Here are three more ways:
m <- 5 #number of top results to show in final table (excl. "Other")
n <- m+1
#preserves the order (or better: reesatblishes it by index)
df <- arrange(df, desc(n)) %.% #this could be skipped if data already ordered
mutate(idx = 1:n(), Category = ifelse(idx > m, "Other", Category)) %.%
group_by(Category) %.%
summarize(n = sum(n), idx = first(idx)) %.%
arrange(idx) %.%
select(-idx)
#doesnt preserve the order (same result as in first dplyr solution, ordered by Category)
df[order(df$n, decreasing=T),] #this could be skipped if data already ordered
df[n:nrow(df),1] <- "Other"
df <- aggregate(n ~ Category, data = df, FUN = "sum")
#preserves the order (without extra index)
df[order(df$n, decreasing=T),] #this could be skipped if data already ordered
df[n:nrow(df),1] <- "Other"
df[n,2] <- sum(df$n[df$Category == "Other"])
df <- df[1:n,]

Different package/different syntax version:
library(data.table)
dt = as.data.table(DF)
dt[order(-n), # your data is already sorted, so this does nothing for it
if (.BY[[1]]) .SD else list("Other", sum(n)),
by = 1:nrow(dt) <= 5][, !"nrow", with = F]
# Category n
#1: E 163051
#2: K 127133
#3: M 106680
#4: L 64868
#5: I 49701
#6: Other 217022

This function modifies a column, replacing the infrequent entries with Other, either by specifying a minimum frequency, or by specifying the resultant number of categories intended.
#' #title Group infrequent entries into 'Other category'
#' #description Useful when you want to constrain the number of unique values in a column.
#' #param .data Data containing variable.
#' #param var Variable containing infrequent entries, to be collapsed into "Other".
#' #param n Threshold for total number of categories above "Other".
#' #param count Threshold for total count of observations before "Other".
#' #param by Extra variables to group by when calculating \code{n} or \code{count}.
#' #param copy Should \code{.data} be copied? Currently only \code{TRUE} is supported.
#' #param other.category Value that infrequent entries are to be collapsed into. Defaults to \code{"Other"}.
#' #return \code{.data} but with \code{var} changed to be grouped into smaller categories.
#' #export
mutate_other <- function(.data, var, n = 5, count, by = NULL, copy = TRUE, other.category = "Other"){
stopifnot(is.data.table(.data),
is.character(other.category),
identical(length(other.category), 1L))
had.key <- haskey(.data)
if (!isTRUE(copy)){
stop("copy must be TRUE")
}
out <- copy(.data)
if (had.key){
orig_key <- key(out)
} else {
orig_key <- "_order"
out[, "_order" := 1:.N]
setkeyv(out, "_order")
}
if (is.character(.data[[var]])){
stopifnot(!("nvar" %in% names(.data)),
var %in% names(.data))
N <- .rank <- NULL
n_by_var <-
out %>%
.[, .N, keyby = c(var, by)] %>%
.[, .rank := rank(-N)]
out <- merge(out, n_by_var, by = c(var, by))
if (missing(count)){
out[, (var) := dplyr::if_else(.rank <= n, out[[var]], other.category)]
} else {
out[, (var) := dplyr::if_else(N >= count, out[[var]], other.category)]
}
out <-
out %>%
.[, N := NULL] %>%
.[, .rank := NULL]
setkeyv(out, orig_key)
if (!had.key){
out[, (orig_key) := NULL]
setkey(out, NULL)
}
out
} else {
warning("Attempted to use by = on a non-character vector. Aborting.")
return(.data)
}
}
https://github.com/HughParsonage/hutils/blob/master/R/mutate_other.R

Related

Calculating Highest In, First Out on trades

I am trying to use the Highest In, First Out accounting method on trades.
Highest In, First Out means that when you sell, you sell your most expensive shares first.
Here are my buys and sells (example borrowed from R calculate aggregate gains or loss using FIFO method - this is a similar, but different problem):
buy = data.frame(BuyTransactionID = c(1:10),
Ticker=c(rep('MSFT',4),rep('AMZN',3),rep('DOCU',3)),
Date=c(rep('01-01-2018',2),rep('01-14-2020',2),rep('01-01-2018',2),rep('01-14-2020',1),'01-01-2018','03-15-2020','04-06-2020'),
Price=c(100,102,102,107,2000,2010,2011,197,182,167),
Quantity=c(10,10,5,5,1,1,2,12,15,15))
sell = data.frame(SellTransactionID=c(1:7),
Ticker=c('MSFT','MSFT','AMZN','AMZN','DOCU','DOCU','DOCU'),
Date=c('01-07-2020','01-20-2020','01-01-2020','01-30-2020','01-15-2020','04-10-2020','04-20-2020'),
Price=c(97,110,2100,2050,210,205,225),
Quantity=c(15,12,1,3,10,5,3))
Here are the rules:
You sell the most expensive (highest price) shares first.
You cannot sell shares before you purchased them
You cannot sell the same shares multiple times
Example problem:
The first sale (SellTransactionID = 1) is 15 shares of MSFT on 01-07-2020. So, any purchase made before that date can be sold. Based on date, the eligible shares to be sold are those from BuyTransactionID 1 and 2. BuyTransactionID 2 is the highest price. Therefore, all 10 shares of BuyTransactionID 2 are sold and the remaining 5 shares come from BuyTransactionID 1.
Desired output:
'Date Sold' = the date sold (self-explanatory);
'Ticker' = the ticker sold;
'Proceeds' = the total dollar amount sold;
'Cost basis' = a weighted average of the shares sold.
Example solution:
This is the solution for SellTransactionID 1. A properly solution automates this and calculates for all SellTransactionIDs.
result <- data.frame(SellDate = '01-07-2020', Ticker = "MSFT", Proceeds = 1455, CostBasis = 101.33)
Cost Basis Example:
Cost basis is calculated as a weighted average. For the preceding example, Cost Basis is calculated as such: (Quantity1 * Price1 + Quanity2 * Price2 + .....)/sum of all Quantity(s)
So for example above: (10 * 102 + 5 * 100)/15
The answer by #DPH is excellent, but unfortunately not quite accurate enough. I will explain why.
Here is a new dataset where all the purchases precede the sales:
buy = data.frame(BuyTransactionID = c(1:10),
Ticker=c(rep('MSFT',10)),
Date=c(rep('01-01-2020',10)),
Price=c(100,102,102,107,105,111,109,112,115,106),
Quantity=c(10,10,5,5,1,1,2,12,15,15))
sell = data.frame(SellTransactionID=c(1:4),
Ticker=c('MSFT','MSFT','MSFT', 'MSFT'),
Date=c('01-07-2020','01-20-2020','01-21-2020',
'01-22-2020'),
Price=c(120,119,117, 121),
Quantity=c(7,12,1, 5))
If you apply the solution from #DPH, you will get this result:
Notice that the 'Remain_Price' does not change, nor does the 'Sales_Cost' for the last three transactions. This happens because the function determines how many shares remain after the first sale and what the average price of the remaining shares is. The shares purchased preceding the first sale can no longer be sold individually. They are now treated as a single entity with an average price and the remaining number of shares.
For example, a total of 76 shares were bought in this example. The first sale sells 7 shares. Now, 69 shares remain as seen in 'Remain_Qtd'. An average price is calculated for those remaining shares - that price is $106.5652. Now, the process considers all 69 shares to be priced at $106.5652 and the remaining sales reduce the quantity of 'Remain_Qtd', but does not change the 'Remain_Price'. The remaining shares can no longer be considered at the price that they were bought at, they are collectively part of the remaining shares and the average remaining price.
This occurs because of the object dfo and the recycling of dfo in the object sdf. In particular, this line calculates an average remaining price that is then recycled through dfo and sdf.
Price = (sum(ip * iq) - v) / sum(sdf$Quantity)
and Quantity = sum(sdf$Quantity) adds together all the remaining shares.
I think the answer by #DPH is brilliant, but hope that it can be modified to treat each purchase individually rather than aggregating past purchases.
If I understood your problem correctly this is one possible solution. In resume I am combinig the sales and buys data and group it in sales blocks (given by the sales ID). This assumes that the order of sales IDs is according to the date column. I then loop over these sales blocks sequentially and write the intermediate result to a individual dataframe. For each sales block processing this result dataframe is filtered for the last sales block result of the same ticker. This means sales quantity must not be larger than available quantity according to the timeline (since you can not sell what you not have this should not be of concern anyhow I have to point it out as a possible limitation)
The proposed loop solution 1 is not the best way to work data in R since it is a loop, which grows a data.frame. Since you listed the purrr tag I adapted the code for the second part of the answer to work with the map() function.
Before we get to the actual coding lets prepare the data first (need for both parts of the answer the same way):
library(tidyverse) # need dplyr, using lubridate for date parsing and tidyr to fill data.frame
# include identifier variable, manipulate column name and union sells and buy
df <- dplyr::mutate(buy, io = "i") %>%
dplyr::rename(TID = BuyTransactionID) %>%
dplyr::union(dplyr::mutate(sell, io = "o") %>%
dplyr::rename(TID = SellTransactionID)) %>%
# sort the data
dplyr::arrange(Ticker, Date) %>%
# make Qtd negative for sales, remove buy IDs, parce Date and set up helper column
dplyr::mutate(Quantity = ifelse(io == "o", -1 * Quantity, Quantity),
TID = ifelse(io == "i", NA, TID),
Date = lubridate::mdy(Date),
hprice = NA_real_) %>%
# group data to fill backwards per group
dplyr::group_by(Ticker) %>%
tidyr::fill(TID, .direction = "up") %>%
# ungroup to prevent unwanted behaviour downstream
dplyr::ungroup()
1 Standard loop
dfo <- df[0, ] # empty copy of df
for (i in sort(unique(df$TID))) {
# which ticker are we working with at this sale
t <- unique(df[df$TID == i, ]$Ticker)
# bind data from last sale of this ticker to current sale
sdf <- rbind(dplyr::slice_max(dfo[dfo$Ticker == t, ], "Date"),
df[df$TID == i, ])
# current sales quantiy as positive value
o1 <- abs(sdf[sdf$io == "o", ]$Quantity)
# copy to use for greedy algo
o2 <- o1
# vectors of price and qtd of bought shares at this sale, having price in decreasing order
ip <- sdf[sdf$io == "i", ] %>% dplyr::arrange(desc(Price)) %>% dplyr::pull(Price)
iq <- sdf[sdf$io == "i", ] %>% dplyr::arrange(desc(Price)) %>% dplyr::pull(Quantity)
# total value of shares in greedy
v <- 0
# loop to run over bought prices and quantities do reduce from sold qtd per sales block
# check if shares block is larger then remaining sales qtd to break loop
for(l in 1:length(ip)){
if(o2 < iq[l]){
v <- v + ip[l] * o2
break
}else{
o2 <- o2 - iq[l]
v <- v + ip[l] * iq[l]
}
}
# fill sales block frame and bind to output df
dfo <- rbind(dfo,
data.frame(TID = i,
Ticker = t,
Date = max(sdf$Date),
Price = (sum(ip * iq) - v) / sum(sdf$Quantity),
Quantity = sum(sdf$Quantity),
io = "i",
hprice = v / o1))
}
# format, join original data and calculate result per Sales block
dplyr::select(dfo, Ticker, Date, Remain_Price = Price, Remain_Qtd = Quantity, Sales_Cost = hprice) %>%
dplyr::left_join(dplyr::select(df, Ticker, Date, Sales_Price = Price, Sales_Qtd = Quantity), by = c("Ticker", "Date")) %>%
dplyr::mutate(Result = abs(Sales_Qtd) * (Sales_Price - Sales_Cost))
Ticker Date Remain_Price Remain_Qtd Sales_Cost Sales_Price Sales_Qtd Result
1 MSFT 2020-01-07 100.0000 5 101.3333 97 -15 -65
2 MSFT 2020-01-20 100.0000 3 103.7500 110 -12 75
3 AMZN 2020-01-01 2000.0000 1 2010.0000 2100 -1 90
4 AMZN 2020-01-30 NaN 0 2007.3333 2050 -3 128
5 DOCU 2020-01-15 197.0000 2 197.0000 210 -10 130
6 DOCU 2020-04-10 173.6667 27 188.0000 205 -5 85
7 DOCU 2020-04-20 0.0000 -1 131.3333 225 -3 281
2 loop rephrase as purrr solution (be aware of the global assignment operartor (<<- instead of <-) for assignment of dfo at end of function)
# rephrase loop as function
myfun <- function(i){
# which ticker are we working with at this sale
t <- unique(df[df$TID == i, ]$Ticker)
# bind data from last sale of this ticker to current sale
sdf <- rbind(dplyr::slice_max(dfo[dfo$Ticker == t, ], "Date"),
df[df$TID == i, ])
# current sales quantiy as positive value
o1 <- abs(sdf[sdf$io == "o", ]$Quantity)
# copy to use for greedy algo
o2 <- o1
# vectors of price and qtd of bought shares at this sale, having price in decreasing order
ip <- sdf[sdf$io == "i", ] %>% dplyr::arrange(desc(Price)) %>% dplyr::pull(Price)
iq <- sdf[sdf$io == "i", ] %>% dplyr::arrange(desc(Price)) %>% dplyr::pull(Quantity)
# total value of shares in greedy
v <- 0
# loop to run over bought prices and quantities do reduce from sold qtd per sales block
# check if shares block is larger then remaining sales qtd to break loop
for(l in 1:length(ip)){
if(o2 < iq[l]){
v <- v + ip[l] * o2
break
}else{
o2 <- o2 - iq[l]
v <- v + ip[l] * iq[l]
}
}
# fill sales block frame and bind to output df
dfo <<- rbind(dfo,
data.frame(TID = i,
Ticker = t,
Date = max(sdf$Date),
Price = (sum(ip * iq) - v) / sum(sdf$Quantity),
Quantity = sum(sdf$Quantity),
io = "i",
hprice = v / o1))
}
# empty copy of df
dfo <- df[0, ]
purrr::map(sort(unique(df$TID)), ~ myfun(.x))[[max(df$TID)]] %>%
dplyr::select(Ticker, Date, Remain_Price = Price, Remain_Qtd = Quantity, Sales_Cost = hprice) %>%
dplyr::left_join(dplyr::select(df, Ticker, Date, Sales_Price = Price, Sales_Qtd = Quantity), by = c("Ticker", "Date")) %>%
dplyr::mutate(Result = abs(Sales_Qtd) * (Sales_Price - Sales_Cost))
Ticker Date Remain_Price Remain_Qtd Sales_Cost Sales_Price Sales_Qtd Result
1 MSFT 2020-01-07 100.0000 5 101.3333 97 -15 -65
2 MSFT 2020-01-20 100.0000 3 103.7500 110 -12 75
3 AMZN 2020-01-01 2000.0000 1 2010.0000 2100 -1 90
4 AMZN 2020-01-30 NaN 0 2007.3333 2050 -3 128
5 DOCU 2020-01-15 197.0000 2 197.0000 210 -10 130
6 DOCU 2020-04-10 173.6667 27 188.0000 205 -5 85
7 DOCU 2020-04-20 0.0000 -1 131.3333 225 -3 281
EDIT
To keep track of the remaining stocks we need a second df to hold the current portfolio data. I did not optimize the code and editted only the loop, the purrr adaption should be pretty straight foreward though.
library(tidyverse) # need dplyr, using lubridate for date parsing and tidyr to fill data.frame
# include identifier variable, manipulate column name and union sells and buy
df <- dplyr::mutate(buy, io = "i") %>%
dplyr::rename(TID = BuyTransactionID) %>%
dplyr::union(dplyr::mutate(sell, io = "o") %>%
dplyr::rename(TID = SellTransactionID)) %>%
# sort the data
dplyr::arrange(Ticker, Date) %>%
# make Qtd negative for sales, remove buy IDs, parce Date and set up helper column
dplyr::mutate(Quantity = ifelse(io == "o", -1 * Quantity, Quantity),
TID = ifelse(io == "i", NA, TID),
Date = lubridate::mdy(Date),
hprice = NA_real_) %>%
# group data to fill backwards per group
dplyr::group_by(Ticker) %>%
tidyr::fill(TID, .direction = "up") %>%
# ungroup to prevent unwanted behaviour downstream
dplyr::ungroup()
dfo <- df[0, ] # empty copy of df
dfh <- df[0, c("Ticker", "Price", "Quantity", "io")] # to hold current stock aka portfolio
for (i in sort(unique(df$TID))) {
# which ticker are we working with at this sale
t <- unique(df[df$TID == i, ]$Ticker)
# bind data from last sale of this ticker to current sale
sdf <- rbind(dplyr::slice_max(dfo[dfo$Ticker == t, ], "Date"),
df[df$TID == i, ])
# bind data from current portfolio to buys between last and current sale (new port folio before sale)
sdfh <- rbind(dfh[dfh$Ticker == t, ],
df[df$TID == i & df$io == "i", c("Ticker", "Price", "Quantity", "io")])
# current sales quantiy as positive value
o1 <- abs(sdf[sdf$io == "o", ]$Quantity)
# copy to use for greedy algo
o2 <- o1
# vectors of price and qtd of bought shares at this sale, having price in decreasing order
ip <- sdfh %>% dplyr::arrange(desc(Price)) %>% dplyr::pull(Price)
iq <- sdfh %>% dplyr::arrange(desc(Price)) %>% dplyr::pull(Quantity)
ips <- ip
iqs <- iq
# total value of shares in greedy
v <- 0
# loop to run over bought prices and quantities do reduce from sold qtd per sales block
# check if shares block is larger then remaining sales qtd to break loop
for(l in 1:length(ip)){
if(o2 < iq[l]){
v <- v + ip[l] * o2
iqs[1] <- iqs[1] - o2
break
}else{
o2 <- o2 - iq[l]
v <- v + ip[l] * iq[l]
ips <- ips[-1]
iqs <- iqs[-1]
}
}
dfh <- rbind(dfh[dfh$Ticker != t, ],
data.frame(Ticker = t,
Price = ips,
Quantity = iqs,
io = "i"))
# fill sales block frame and bind to output df
dfo <- rbind(dfo,
data.frame(TID = i,
Ticker = t,
Date = max(sdf$Date),
Price = sum(ips * iqs) / sum(iqs),
Quantity = sum(iqs),
io = "i",
hprice = v/o1))
}
dfo
TID Ticker Date Price Quantity io hprice
1 1 MSFT 2020-01-07 106.5652 69 i 115
2 2 MSFT 2020-01-20 105.0000 57 i 114
3 3 MSFT 2020-01-21 104.8750 56 i 112
4 4 MSFT 2020-01-22 104.1765 51 i 112
Here is the final working solution that I have come to with the help of #DPH. I have made a couple of changes to #DPH's edited solution.
The edited solution does not work when all the shares of a stock are sold for multiple reasons including the dfh object. The updates solution does work with the modified dateset that I provided but not the original dateset. I have modified the answer so that it works when all shares are sold.
I have modified the result to include the dates of the purchases. This will be important for determining whether the sale is long term or short term capital gains.
I have removed tickers from stocks that have been purchased but not sold, as those will break the script
I have applied purrr to the updated solution to avoid looping.
I have changed the base subsetting (i.e., df[]) to dplyr subsetting (i.e., df %>% filter(). For some reason the base subsetting was resulting in rows with NA values in my actual dataset even though it did not cause that in the sample dataset. The NA rows caused the solution not to work.
data frame prep:
df <- buy %>% filter(Ticker %in% unique(sell$Ticker)) %>% dplyr::mutate(io = "i") %>%
dplyr::rename(TID = BuyTransactionID) %>%
dplyr::union(dplyr::mutate(sell, io = "o") %>%
dplyr::rename(TID = SellTransactionID)) %>%
# sort the data
dplyr::arrange(Ticker, Date) %>%
# make Qtd negative for sales, remove buy IDs, parce Date and set up helper column
dplyr::mutate(Quantity = ifelse(io == "o", -1 * Quantity, Quantity),
TID = ifelse(io == "i", NA, TID),
Date = lubridate::mdy(Date),
hprice = NA_real_) %>%
dplyr::arrange(Ticker, Date) %>%
# group data to fill backwards per group
dplyr::group_by(Ticker) %>%
tidyr::fill(TID, .direction = "up") %>%
# ungroup to prevent unwanted behaviour downstream
dplyr::ungroup()
df$Dates_bought <- NA
function and purrr:
# rephrase loop as function
myfun <- function(i){
# which ticker are we working with at this sale
t <- unique(df[df$TID == i, ]$Ticker)
# bind data from last sale of this ticker to current sale
sdf <- rbind(dplyr::slice_max(dfo[dfo$Ticker == t, ], "Date"),
df %>% filter(TID == i))
sdfh <- rbind(dfh %>% filter(Ticker == t),
df %>% filter(TID == i & io == "i") %>% select(c("Ticker", "Date", "Price", "Quantity", "io")))
# current sales quantiy as positive value
o1 <- abs(sdf %>% filter(io == "o") %>% pull(Quantity))
# copy to use for greedy algo
o2 <- o1
# vectors of price and qtd of bought shares at this sale, having price in decreasing order
ip <- sdfh %>% dplyr::arrange(desc(Price)) %>% dplyr::pull(Price)
iq <- sdfh %>% dplyr::arrange(desc(Price)) %>% dplyr::pull(Quantity)
date <- sdfh %>% dplyr::arrange(desc(Price)) %>% dplyr::pull(Date)
ips <- ip
iqs <- iq
dates <- date
# total value of shares in greedy
v <- 0
# loop to run over bought prices and quantities do reduce from sold qtd per sales block
# check if shares block is larger then remaining sales qtd to break loop. Modifications
# to make dates work properly.
for(l in 1:length(ip)){
if(o2 < iq[l]){
v <- v + ip[l] * o2
iqs[1] <- iqs[1] - o2
dates2 <- if(o2 == 0) dates else dates[-1]
break
}else{
o2 <- o2 - iq[l]
v <- v + ip[l] * iq[l]
ips <- ips[-1]
iqs <- iqs[-1]
dates <- dates[-1]
dates2 <- dates
}
}
# Needs to have the if else statements because when all shares are sold, the length
# of dates, ips, and iqs is 0, whereas Ticker and io are length 1.
dfh <<- rbind(dfh[dfh$Ticker != t, ],
data.frame(Ticker = if(length(ips) == 0) numeric(length = 0L) else t,
Date = dates,
Price = ips,
Quantity = iqs,
io = if(length(ips) == 0) numeric(length = 0L) else "i"))
# fill sales block frame and bind to output df
dfo <<- rbind(dfo,
data.frame(TID = i,
Ticker = t,
Date = max(sdf$Date),
Dates_bought = paste(date[seq(length(date)-length(dates2))], collapse = ","),
Price = sum(ips * iqs) / sum(iqs),
Quantity = sum(iqs),
io = "i",
hprice = v / o1))
}
# empty copy of df
dfo <- df[0, ] # empty copy of df
dfh <- df[0, c("Ticker", "Date", "Price", "Quantity", "io")] # to hold current stock aka portfolio
hifo <- purrr::map(sort(unique(df$TID)), ~ myfun(.x))[[length(unique(df$TID))]] %>%
dplyr::select(Ticker, Date, Dates_bought, Remain_Price = Price, Remain_Qtd = Quantity, Sales_Cost = hprice) %>%
dplyr::left_join(dplyr::select(df, Ticker, Date, Sales_Price = Price, Sales_Qtd = Quantity), by = c("Ticker", "Date")) %>%
dplyr::mutate(Result = abs(Sales_Qtd) * (Sales_Price - Sales_Cost))
Let me know if anyone has any issues. I'd like to get this into a shinyapp and maybe develop it more. Let me know if you're interested in collaborating.

R, interval, mutate

I have a dataset about animals.
library(tidyverse)
a <- c("Date", "Specie", "Number")
b <- c("2020-01-01", "Dog", "3")
c <- c("2020-01-02", "Dog", "4")
d <- c("2020-01-03", "Dog", "5")
e <- c("2020-01-04", "Dog", "6")
f <- c("2020-01-01", "Cat", "3")
g <- c("2020-01-02", "Cat", "7")
h <- c("2020-01-03", "Cat", "8")
i <- c("2020-01-04", "Cat", "10")
df <- as.data.frame(rbind(b, c, d, e, f, g, h, i))
names(df) <- a
df$Date <- as.Date(df$Date)
df$Number <- as.integer(df$Number)
start <- as.Date("2020-01-02")
end <- as.Date("2020-01-04")
df %>%
filter(Date >= start & Date <= end) %>%
group_by(Specie) %>%
summarise(new = prod(10 + Number), .groups = "drop")
The goal is to create a new variable that gives me: (using tidyverse)
For each specie, between 2020-01-02 and 2020-01-04 (included), I want a new variable that is the product of (10+number of dead animals that day).
For-example, for dogs it would be (10+4)(10+5)(10+6).
Same for all specie.
Please note that for some specie, I don't have the number of dead animals during all the days of the interval.
Is dropping them the best option?
If yes, how do you do it.
Note that the code filters and hence keeps only my dataset for the dates specify. I want to return the output that the code delivers but in my original dataset.
That is, the output I get should be a new variable (mutate) for all species. And not a subset of my dataset.
I did a left-join to merge the original dataset with the new (filtered) one. It works, but I think there's a more efficient way.
Thank's for your help much appreciated.
If there are some numbers missing in Number, I could think of three ways for handling those missing values:
Set them to NA and use prod(..., na.rm = TRUE) to remove them in the calculation of the product.
Set them to 0 and use prod(..., na.rm = FALSE) to at least inflate the product by a factor of 10.
If you want to preserve some mean (arithmetic or geometric) of the factors (10+a_i), set the missing values to that mean minus 10.
In cases 2 and 3 you can for example give a lower bound for the product: prod >= 10^n. (Taking the logarithm to base 10 on both sides yields log(prod) = sum(log) >= n.) But maybe you want to reserve case 2 for those rows that really have zero dead animals.
Regarding your second point, use mutate with ifelse to flag your wanted dates (instead of filter) and additionally group by this new flag. Then use again mutate instead of summarise.
df %>% mutate(
new = ifelse(Date >= start & Date <= end, 1, NA)
) %>% group_by(
Specie,
new
) %>% mutate(
new = ifelse(!is.na(new), prod(10 + Number, na.rm = FALSE), new)
)

Filter specific values of optional argument with dplyr

I have a dataframe that looks like this :
df <- data.frame(ID = rep(1:10, each = 6),
Site = rep(c("A","B","C","D"), each = 6, times = 10),
Department = rep(c("E","F","G","H"), each = 6, times = 10),
Occupation = rep(c("I","J","K","L"), each = 6, times = 10),
Construct = rep(paste0("X",1:6), times = 10),
Score = sample(c("Green","Orange","Red"), size = 60, replace = TRUE))
head(df)
Basically, each ID belongs to a site, a department and has an occupation, and is evaluated on six constructs.
I have adapted a previous function of mine to compute the N and the rate of each Score category for a given Construct, by any combination of Site, Department and Occupation :
my_function <- function(..., dimension = NULL){
df %>%
filter(Construct == dimension) %>%
group_by(..., Score) %>%
summarise(n = n()) %>%
mutate(rate= round(n/sum(n),2))
}
This works perfectly, as I simply have to indicate which Construct, and add any of the three factors (Site, Departement, Occupation) as optional arguments to obtain a summary. For example, a summary of X1 by Site and Department would be :
my_function(dimension = "X1", Site, Department)
However, I would like to filter out some of the values of the Occupation variable, but only when looking at a summary including this variable. I tried to do so by checking whether Occupation was passed as an optional argument, and exclude the specific values when it was the case. Something like :
my_function <- function(..., dimension = NULL){
if(hasArg(Occupation)){
df %>%
filter(Construct == dimension, Occupation != "I") %>%
group_by(..., Score) %>%
summarise(n = n()) %>%
mutate(rate= round(n/sum(n),2))
} else {
df %>%
filter(Construct == dimension) %>%
group_by(..., Score) %>%
summarise(n = n()) %>%
mutate(rate= round(n/sum(n),2))
}
}
But it does not seem to work, as it consistently returns includes the values I'd like to filter out, even when I specify Occupation as an optional argument. I tried to fiddle with things like curly-curly {{}} but I can't seem to get this function to filter the specific values.
hasArg seems to expect all of the arguments to be named, whereas in
my_function(dimension="X1", Site, Department, Occupation)
this is not the case.
Perhaps:
my_function <- function(..., dimension = NULL){
hasOcc <- "Occupation" %in% as.character(match.call())
if (hasOcc) {
df %>%
filter(Construct == dimension, Occupation != "I") %>%
group_by(..., Score) %>%
summarise(n = n()) %>%
mutate(rate= round(n/sum(n),2))
} else {
df %>%
filter(Construct == dimension) %>%
group_by(..., Score) %>%
summarise(n = n()) %>%
mutate(rate= round(n/sum(n),2))
}
}
my_function(Site, Department, Occupation, dimension = "X1")
# # A tibble: 7 x 6
# # Groups: Site, Department, Occupation [3]
# Site Department Occupation Score n rate
# <chr> <chr> <chr> <chr> <int> <dbl>
# 1 B F J Green 6 0.6
# 2 B F J Orange 4 0.4
# 3 C G K Green 2 0.2
# 4 C G K Orange 2 0.2
# 5 C G K Red 6 0.6
# 6 D H L Green 6 0.6
# 7 D H L Orange 4 0.4
Some other thoughts on the function:
reaching out of its scope to get df is not a good practice: it is not really reproducible, and it can be difficult to troubleshoot. For instance, if you forget to assign your data to df, you'll see
my_function(Site, Department, Occupation, dimension = "X1")
# Error in UseMethod("filter") :
# no applicable method for 'filter' applied to an object of class "function"
(This error is because it is finding stats::df.)
Further, if you want to use it against a different non-df-named dataset, you're out of luck.
Recommendation: explicitly pass the data. A tidyverse commonality is to pass it as the first argument. One side-benefit of this is that you can (generally) use this in the middle of a %>%-pipe directly.
my_function <- function(.data, ..., dimension = NULL) { .data %>% ... }
You can reduce the number of pipelines in there by including the Occupation conditional directly in the filter(..). This is not just code-golf: in more complex code examples, it's not hard to imagine updating one of the %>%-pipes and either forgetting the other or updating it differently. Since the only difference here is a component of filter, we can add it there:
my_function <- function(..., dimension = NULL){
hasOcc <- "Occupation" %in% as.character(match.call())
df %>%
filter(Construct == dimension, !hasOcc | Occupation != "I") %>%
group_by(..., Score) %>%
summarise(n = n()) %>%
mutate(rate= round(n/sum(n),2))
}
If dimension is required, don't default to NULL since, if omitted, this will produce an error.
my_function <- function(.data, ..., dimension) { ... }
If it is instead optional and you don't want to filter on it if not provided, then you need to check for that in your filter:
filter(if (is.null(dimension)) TRUE else Construct == dimension, ...)
If you can imagine wanting dimension to be either NA (matching an explicit NA value in the data) or you might want "one or more", then you may want to use %in% instead of ==:
NA == NA
# [1] NA
NA %in% NA
# [1] TRUE
So your function could use
filter(if (is.null(dimension)) TRUE else Construct %in% dimension, ...)
These points would result in your function being either
my_function <- function(.data, ..., dimension = NULL){
hasOcc <- "Occupation" %in% as.character(match.call())
.data %>%
filter(if (is.null(dimension)) TRUE else Construct %in% dimension, !hasOcc | Occupation != "I") %>%
group_by(..., Score) %>%
summarise(n = n()) %>%
mutate(rate= round(n/sum(n),2))
}
if dimension is optional, or
my_function <- function(.data, ..., dimension) {
hasOcc <- "Occupation" %in% as.character(match.call())
.data %>%
filter(Construct %in% dimension, !hasOcc | Occupation != "I") %>%
group_by(..., Score) %>%
summarise(n = n()) %>%
mutate(rate= round(n/sum(n),2))
}
otherwise.

Select columns and group by columns in function arguments

I am trying to write a function so that I can input any columns to be described both at the overall level and by a grouping variable.
However, I am having trouble with getting output for grouped results.
My data:
df <- data.frame(gender=c("m", "f", "m","m"), age=c("18-22","23-32","23-32","50-60"), income=c("low", "low", "medium", "high"), group=c("A", "A", "B", "B"))
> df
gender age income group
1 m 18-22 low A
2 f 23-32 low A
3 m 23-32 medium B
4 m 50-60 high B
Function:
library(dplyr)
make_sum <- function(data=df, cols, group_var) {
data %>% dplyr::select(cols) %>%
# print tables with frequency and proportions
apply(2, function(x) {
n <- table(x, useNA = "no")
prop=round(n/length(x[!is.na(x)])*100,2)
print(cbind(n, prop))
})
# print tables by group
data %>% dplyr::select(cols, vars(group_var)) %>%
apply(2, function(x) {
n <- table(x, vars(group_var),useNA = "no")
print(n)
})
}
cols <- df %>% dplyr::select(gender,age, income) %>% names()
make_sum(data=df, cols=cols, group_var="group")
I get the proper output for the overall tables but not the grouped, with this error showing:
Error: `vars(group_var)` must evaluate to column positions or names, not a list
Desired output (example) for grouped gender variable:
A B
f 1 0
m 1 2
Instead of using the apply with MARGIN = 2, summarise_all can be called here. Also, the vars wrapped is applied along with a tidyverse function. Here, inorder to get the frequency, an option is to subset the column with [[ which is more direct. Also, as summarise returns only a single row (for each group - if there is grouping variable), we can wrap the output in a list
make_sum <- function(data=df, cols, group_var) {
data %>%
dplyr::select(cols) %>%
summarise_all(~ {
n <- table(., data[[group_var]], useNA = "no")
#list(round(n/length(.[!is.na(.)])*100,2))
list(n)
})
}
cols <- df %>%
dplyr::select(gender,age, income) %>%
names()
out <- make_sum(data=df, cols=cols, group_var="group")
out$gender
#[[1]]
#. A B
# f 1 0
# m 1 2

Calling recursive functions in R

Assuming I have a dataframe, df with this info
group wk source revenue
1 1 C 100
1 1 D 200
1 1 A 300
1 1 B 400
1 2 C 500
1 2 D 600
I'm trying to programatically filter's down to rows of unique combinations of group, wk and source, and then perform some operations on them, before combining them back into another dataframe. I want to write a function that can scale to any number of segments (and not just the example scenario here) and filter down rows. All I need to pass would be the column names by which I want to segment
eg.
seg <- c("group", "wk", "source")
One unique combination to filter rows in df would be
df %>% filter(group == 1 & wk == 1 & source == "A")
I wrote a recursive function (get_rows) to do so, but it doesn't seem to do what I want. Could anyone provide inputs on where I'm going wrong ?
library(dplyr)
filter_row <- function(df,x)
{
df %>% filter(group == x$group & wk == x$wk & source == x$source)
}
seg <- c("group", "wk", "source")
get_rows <- function(df,seg,pos = 1, l = list())
{
while(pos <= (length(seg) + 1))
{
if(pos <= length(seg))
for(j in 1:length(unique(df[,seg[pos]])))
{
k <- unique(df[,seg[pos]])
l[seg[pos]] <- k[j]
get_rows(df,seg,pos+1,l)
return()
}
if(pos > length(seg))
{
tmp <- df %>% filter_row(l)
<call some function on tmp>
return()
}
}
}
get_rows(df,seg)
EDIT: I understand there are prebuilt methods I can use to get what I need, but I'm curious about where I'm going wrong in the recursive function I wrote.
There might be a data.table/dplyr solution out there, but this one is pretty simple.
# Just paste together the values of the column you want to aggregate over.
# This creates a vector of factors
f <- function(data, v) {apply(data[,v,drop=F], 1, paste, collapse = ".")}
# Aggregate, tapply, ave, and a few more functions can do the same thing
by(data = df, # Your data here
INDICES = f(df, c("group", "wk", "source")), # Your data and columns here
FUN = identity, simplify = F) # Your function here
Can also use library(dplyr) and library(data.table)
df %>% data.table %>% group_by(group, wk, source) %>% do(yourfunctionhere, use . for x)

Resources