Update columns by joining more than one columns - r

I have two tables and I need to update pro_sales column values in the first table from pro_sales values in the second.
df1 <- data.frame(storecode = c(100,100,100,200,200),
productcode = c(1,2,3,1,2), pro_sales = c(0,0,0,0,0))
df2 <- data.frame(storecode = c(100,100,200),
productcode = c(1,2,1), pro_sales = c(0,1,0))
I need to left join on the columns storecode and productcode. Below should be my final table:
storecode productcode pro_sales
1 100 1 0
2 100 2 1
3 100 3 0
4 200 1 0
5 200 2 0
I was able to left join in dplyr but after that i need help please?
df1 %>%
left_join(df2,c("storecode"="storecode","productcode"="productcode")) %>%
mutate( ???? ) %>%
select(names, match, value = value.x)
Thank you.

Another option is to use an update join with the data.table-package:
library(data.table)
setDT(df1)
setDT(df2)
df1[df2, on = .(storecode, productcode), pro_sales := i.pro_sales][]
which gives:
storecode productcode pro_sales
1: 100 1 0
2: 100 2 1
3: 100 3 0
4: 200 1 0
5: 200 2 0

df1 <- data.frame(storecode=c(100,100,100,200,200),
productcode=c(1,2,3,1,2),pro_sales=c(0,0,0,0,0))
df2 <- data.frame(storecode=c(100,100,200),
productcode=c(1,2,1),pro_sales=c(0,1,0))
library(dplyr)
df1 %>%
left_join(df2, by = c("storecode","productcode")) %>%
mutate(pro_sales.y = coalesce(pro_sales.y, 0)) %>%
select(storecode, productcode, pro_sales = pro_sales.y)
# storecode productcode pro_sales
# 1 100 1 0
# 2 100 2 1
# 3 100 3 0
# 4 200 1 0
# 5 200 2 0
I assume that if you want to update values in first table given the second table, as you mentioned, then NA values should be zeros and not what you have in your first table.

Related

Merging two tables considering the range of dates from the second table using R

I have two dataframes:
df_1 <- data.frame(DATES_1=c("12-30-2022", "03-02-2023","05-05-2023"),
DV01_1=c(1,2,3))
df_2 <- data.frame(DATES_2=c("01-01-2023", "02-01-2023", "05-01-2023","06-01-2023"),
DV01_2=c(1,2,3,4))
I want to join the DV01_1 values from df_1 into df_2 respecting the range of dates in DATES_2. the final answer should be this dataframe:
df_3 <- data.frame(DATES_2=c("01-01-2023", "02-01-2023", "05-01-2023","06-01-2023"),
DV01_2=c(1,2,3,4), DV01_1=c(0,0,2,3))
So for example, the last element from df_1 should consider that
01/01/23,02/01/23,05/01/23 < DATES_1 < 06/01/23.
This process can be easily replicated using excel formula SUMIFS, for example.
We may use a non-equi join - Convert the dates to date class in each of the datasets, then shift the DATES_2 to get the previous date as starting date in the range, do a join with the df_1 using a non-equi join to create the DV01_1 column in the df_2
library(data.table)
df_2$DATES_2 <- as.Date(df_2$DATES_2, "%m-%d-%Y")
df_1$DATES_1 <- as.Date(df_1$DATES_1, "%m-%d-%Y")
setDT(df_2)[, st := shift(DATES_2, fill = first(DATES_2))]
df_2[df_1, DV01_1 := DV01_1,
on = .(st <= DATES_1, DATES_2 > DATES_1), mult = "last"]
df_2[, st := NULL][is.na(DV01_1), DV01_1 := 0]
-output
> df_2
DATES_2 DV01_2 DV01_1
1: 2023-01-01 1 0
2: 2023-02-01 2 0
3: 2023-05-01 3 2
4: 2023-06-01 4 3
OP's expected output
> df_3
DATES_2 DV01_2 DV01_1
1 01-01-2023 1 0
2 02-01-2023 2 0
3 05-01-2023 3 2
4 06-01-2023 4 3
Or this can be done with the devel version of dplyr
library(dplyr)
library(lubridate)
library(tidyr)
df_1 %>%
mutate(DATES_1 = mdy(DATES_1)) %>%
left_join(df_2 %>%
mutate(DATES_2 = mdy(DATES_2),
st = lag(DATES_2, default = first(DATES_2))), .,
by = join_by(closest(st <= DATES_1),
closest(DATES_2 > DATES_1))) %>%
select(names(df_2), DV01_1) %>%
mutate(DV01_1 = replace_na(DV01_1, 0))
-output
DATES_2 DV01_2 DV01_1
1 2023-01-01 1 0
2 2023-02-01 2 0
3 2023-05-01 3 2
4 2023-06-01 4 3

break long characters by "-" and identify the unique components by group

Suppose you have a transaction data set with products purchased by customers. You know the total skus (which unique product they buy for each order). But you want to know the unique sku for the lifetime of the user. Say, I buy "apple" for the first time ever today, then apple is a NEW sku. But when I buy "apple" amd "banana" again. Then apple is not a new sku but banana is (if first time purchase).
data
user_id<-c(1,1,1,2,3,4,4)
order_date<-c("2/9/2016",
"11/19/2015",
"12/30/2016",
"9/27/2016",
"12/10/2016",
"11/5/2016",
"1/1/2017")
sku<-c("262-264-280","280-123","510","6251-16990","9227-14572","9227-14572","280")
dt<-data.frame(user_id,order_date,sku)
Output
update: I typed "user_id" as "order_id"
A data.table possibility, using the strings split by -, and checking for new unique values in each row via a set union and Reduce(..., accumulate=TRUE). The count of new values is then a difference between each successive row:
library(data.table)
setDT(dt)
dt[, sku := as.character(sku)]
dt[,
total := lengths(Reduce(union, strsplit(sku, "\\-"), accumulate=TRUE)),
by=user_id
]
dt[, new := c(total[1], diff(total)), by=user_id]
dt
# user_id order_date sku total new
#1: 1 2/9/2016 262-264-280 3 3
#2: 1 11/19/2015 280-123 4 1
#3: 1 12/30/2016 510 5 1
#4: 2 9/27/2016 6251-16990 2 2
#5: 3 12/10/2016 9227-14572 2 2
#6: 4 11/5/2016 9227-14572 2 2
#7: 4 1/1/2017 280 3 1
A possible tidyverse way:
library(dplyr)
library(tidyr)
dt %>%
separate_rows(sku, sep = "-") %>%
mutate(order_date = as.Date(as.character(order_date), "%m/%d/%Y")) %>%
group_by(order_id, sku) %>%
arrange(order_id, order_date) %>%
mutate(idx = row_number() * (row_number() == 1)) %>%
group_by(order_id, order_date) %>%
summarise(sku = paste(sku, collapse = "-"),
`number of new sku purchase` = sum(idx)) %>%
group_by(order_id) %>%
mutate(`total number of sku` = cumsum(`number of new sku purchase`))
Here's a base solution that is similar to #thelatemail:
dt$sku <- as.character(dt$sku)
dt$n_skus <- ave(dt$sku
, dt$user_id
, FUN = function (sku_f) {
sapply(
Reduce(union, strsplit(sku_f, '-', fixed = T), accumulate = T)
, length)
})
dt$n_skus <- as.integer(dt$n_skus)
dt$n_new <- ave(dt$n_skus
, dt$user_id
, FUN = function(n) c(n[1], diff(n)))
dt
user_id order_date sku n_skus n_new
1 1 2/9/2016 262-264-280 3 3
2 1 11/19/2015 280-123 4 1
3 1 12/30/2016 510 5 1
4 2 9/27/2016 6251-16990 2 2
5 3 12/10/2016 9227-14572 2 2
6 4 11/5/2016 9227-14572 2 2
7 4 1/1/2017 280 3 1
And here's a crazy, inefficient data.table solution that doesn't include the original sku field:
library(data.table)
setDT(dt)
dt[, strsplit(as.character(sku), '-'), by = .(user_id, order_date)
][, .SD[!duplicated(V1), .(n_new = .N), by = order_date], by = .(user_id)
][, n_total := cumsum(n_new), by = .(user_id)][]
Performance
#thelatemail's should scale up the best.
Unit: microseconds
expr min lq mean median uq max neval
argonaut_dplyr 10020.601 10247.851 10720.0709 10474.451 10770.751 16021.3 100
thelatemail_dt 1954.501 2072.101 2385.8019 2303.001 2436.202 5807.0 100
#base is always fastest with 7 rows, it would be outstripped by #thelatemail very quickly
cole_base 651.501 751.601 834.0299 772.201 794.752 6309.2 100
cole_bad_dt 7006.400 7355.200 7757.9991 7668.401 7958.651 12708.0 100
purrr_A_sul 14575.501 14862.552 15566.4809 15033.201 15401.601 33263.3 100
Here is one option based on unlist(strsplit(dt$sku,'-')) the current sku then compare it with the previous skus
library(dplyr)
library(purrr)
library(tidyr)
dt %>%
nest(-user_id) %>%
mutate(NNSP = map(data, ~map_dbl(1:length(.x$sku), function(y) {
#browser()
ynow <- unlist(strsplit(as.character(.x$sku)[y],'-'))
yprev <- unique(unlist(strsplit(as.character(.x$sku[1:y-1]),'-')))
length(ynow[!ynow %in% yprev])
})),
TotNNSP = map(NNSP, ~cumsum(.x))) %>%
unnest()
# A tibble: 7 x 5
user_id NNSP TotNNSP order_date sku
<dbl> <dbl> <dbl> <fct> <fct>
1 1 3 3 2/9/2016 262-264-280
2 1 1 4 11/19/2015 280-123
3 1 1 5 12/30/2016 510
4 2 2 2 9/27/2016 6251-16990
5 3 2 2 12/10/2016 9227-14572
6 4 2 2 11/5/2016 9227-14572
7 4 1 3 1/1/2017 280
Using #thelatemail's Reduce and lengths combination, we can do:
library(dplyr)
setdiff2 <- function(x, y) y[!y %in% x]
dt %>%
group_by(user_id) %>%
mutate(sku = as.character(sku),
NNSP = lengths(Reduce(setdiff2, strsplit(sku,'-'), accumulate = TRUE)),
Tot_NNSP = cumsum(NNSP))

Find index of first and last occurrence in data table

I have a data table that looks like
|userId|36|37|38|39|40|
|1|1|0|3|0|0|
|2|3|0|0|0|1|
Where each numbered column (36-40) represent week numbers. I want to calculate the number of weeks before the 1st occurrence of a non-zero value, and the last.
For instance, for userId 1 in my dataset, the first value appears at week 36, and the last one appears at week 38, so the value I want is 2. For userId 2 it's 40-36 which is 4.
I would like to store the data like:
|userId|lifespan|
|1|2|
|2|4|
I'm struggling to do this, can someone please help?
General method I would take is to melt it, convert the character column names to numeric, and take the delta by each userID. Here is an example using data.table.
library(data.table)
dt <- fread("userId|36|37|38|39|40
1|1|0|3|0|0
2|3|0|0|0|1",
header = TRUE)
dt <- melt(dt, id.vars = "userId")
dt[, variable := as.numeric(as.character(variable))]
dt
# userId variable value
# 1: 1 36 1
# 2: 2 36 3
# 3: 1 37 0
# 4: 2 37 0
# 5: 1 38 3
# 6: 2 38 0
# 7: 1 39 0
# 8: 2 39 0
# 9: 1 40 0
# 10: 2 40 1
dt[!value == 0, .(lifespan = max(variable) - min(variable)), by = .(userId)]
# userId lifespan
# 1: 1 2
# 2: 2 4
Here's a dplyr method:
df %>%
gather(var, value, -userId) %>%
mutate(var = as.numeric(sub("X", "", var))) %>%
group_by(userId) %>%
slice(c(which.max(value!=0), max(which(value!=0)))) %>%
summarize(lifespan = var[2]-var[1])
Result:
# A tibble: 2 x 2
userId lifespan
<int> <dbl>
1 1 2
2 2 4
Data:
df = read.table(text = "userId|36|37|38|39|40
1|1|0|3|0|0
2|3|0|0|0|1", header = TRUE, sep = "|")

Flag dates based on multiple columns

I have a df, this provides information about the create_date and delete_date(if any) for a given ID.
Structure:
ID create_date1 create_date2 delete_date1 delete_date2
1 01-01-2014 NA NA NA
2 01-04-2014 01-08-2014 01-05-2014 NA
the create_date and delete_date extends till 10, i.e. create_date10
and delete_date10 columns are present
Rules/Logic:
We charge a user on monthly basis, if a user was created on 30th of a month, even then it's treated as if the user was active for a month(very low cost)
If a user has a delete date (irrespective on which date) in this month, then from next month the user is not charged
If a user has only create_date and no delete_date then all dates including the create_month is charged
Output expected:
ID 2014-01 2014-02 2014-03 2014-04 2014-05 2014-06 2014-07 2014-08
1 1 1 1 1 1 1 1 1
2 0 0 0 1 1 0 0 1
so on till current date
1 indicates the user is charged/active for that month
Problem:
I have been struggling to do this, but can't even understand how to do this. My earlier method is a bit too slow
Previous Solution:
Make the dataset into tall
Insert sequence of dates for each ID as a new column
Use a for loop to check the status
for each ID, status is equal to 1,
if create_date is equal to sequence, and it's 0 if the lag(delete_date) is equal to sequence
else is same as lag(status)
ID create_date delete_date sequence status?
1 01-01-2014 NA 2014-01 1
1 01-01-2014 NA 2014-02 1
1 01-01-2014 NA 2014-03 1
may not be that efficient : assuming this is just for a single year(could be extended easily)
# convert all dates to Date format
df[,colnames(df[-1])] = lapply(colnames(df[-1]), function(x) as.Date(df[[x]], format = "%d-%m-%Y"))
# extract the month
library(lubridate)
df[,colnames(df[-1])] = lapply(colnames(df[-1]), function(x) month(df[[x]]))
# df
# ID create_date1 create_date2 delete_date1 delete_date2
#1 1 1 NA NA NA
#2 2 4 8 5 NA
# get the current month
current.month <- month(Sys.Date())
# assume for now current month is 9
current.month <- 9
flags <- rep(FALSE, current.month)
func <- function(x){
x[is.na(x)] <- current.month # replacing all NA with current month(9)
create.columns.indices <- x[grepl("create_date", colnames(df[-1]))] # extract the create_months
delete.columns.indices <- x[grepl("delete_date", colnames(df[-1]))] # extract the delete_months
flags <- pmin(1,colSums(t(sapply(seq_along(create.columns.indices),
function(x){
flags[create.columns.indices[x]:delete.columns.indices[x]] = TRUE;
flags
}))))
flags
}
df1 = cbind(df$ID , t(apply(df[-1], 1, func)))
colnames(df1) = c("ID", paste0("month",1:current.month))
# df1
# ID month1 month2 month3 month4 month5 month6 month7 month8 month9
#[1,] 1 1 1 1 1 1 1 1 1 1
#[2,] 2 0 0 0 1 1 0 0 1 1
Here's a still-pretty-long tidyverse approach:
library(tidyverse)
df %>% gather(var, date, -ID) %>% # reshape to long form
# separate date type from column set number
separate(var, c('action', 'number'), sep = '_date', convert = TRUE) %>%
mutate(date = as.Date(date, '%d-%m-%Y')) %>% # parse dates
spread(action, date) %>% # spread create and delete to two columns
mutate(min_date = min(create, delete, na.rm = TRUE), # add helper columns; use outside
max_date = max(create, delete, na.rm = TRUE)) %>% # variable to save memory if an issue
group_by(ID, number) %>%
mutate(month = list(seq(min_date, max_date, by = 'month')), # add month sequence list column
# boolean vector of whether range of months in whole range
active = ifelse(is.na(create),
list(rep(FALSE, length(month[[1]]))),
lapply(month, `%in%`,
seq.Date(create,
min(delete, max_date, na.rm = TRUE),
by = 'month')))) %>%
unnest() %>% # unnest list columns to long form
group_by(ID, month = format(month, '%Y-%m')) %>%
summarise(active = any(active) * 1L) %>% # combine muliple rows for one ID
spread(month, active) # reshape to wide form
## Source: local data frame [2 x 9]
## Groups: ID [2]
##
## ID `2014-01` `2014-02` `2014-03` `2014-04` `2014-05` `2014-06` `2014-07` `2014-08`
## * <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 1 1 1 1 1 1 1 1 1
## 2 2 0 0 0 1 1 0 0 1

Count frequencies and add a total sum

I have a large data.frame containing these values:
ID_Path Conversion Lead Path Week
32342 A25177 1 JEFD 2015-25
32528 A25177 1 EUFD 2015-25
25485 A3 1 DTFE 2015-25
32528 Null 0 DDFE 2015-25
23452 A25177 1 JDDD 2015-26
54454 A25177 1 FDFF 2015-27
56848 A2323 1 HDG 2015-27
I want to be able to create a frequency table that displays a table like this:
Week Total A25177 A3 A2323
2015-25 3 2 1 0
2015-26 1 1 0 0
2015-27 2 1 0 1
Where every unique Conversion has a column, and all the times where the Conversion is Null is the same time as when the Lead is 0.
In this example there is 3 unique conversions, sometimes there is 1, sometimes there are 5 or more. So it should not be limited to only 3.
I have created a new DF containing only Conversion that are not Null
I have tried using data.table with this code:
DF[,list(Week=Week,by=Conversion]
with no luck.
I have tried using plyr with this code:
ddply(DF,~Conversion,summarise,week=week)
with no luck.
I would recommend dropping unnecessary levels in order to not mess the output, and then run a simple table and addmargins combination
DF <- droplevels(DF[DF$Conversion != "Null",])
addmargins(table(DF[c("Week", "Conversion")]), 2)
# Conversion
# Week A2323 A25177 A3 Sum
# 2015-25 0 2 1 3
# 2015-26 0 1 0 1
# 2015-27 1 1 0 2
Alternatively, you could do the same with reshape2 while specifying the margins parameter
library(reshape2)
dcast(DF, Week ~ Conversion, value.var = "Conversion", length, margins = "Conversion")
# Week A2323 A25177 A3 (all)
# 1 2015-25 0 2 1 3
# 2 2015-26 0 1 0 1
# 3 2015-27 1 1 0 2
An alternative solution using dplyr and tidyr:
library(tidyr)
library(dplyr)
dt = data.frame(Conversion = c("A1","Null","A1","A3"),
Lead = c(1,0,1,1),
Week = c("2015-25","2015-25","2015-25","2015-26"))
dt %>%
filter(Conversion != "Null") %>%
group_by(Week, Conversion) %>%
summarise(Lead = sum(Lead)) %>%
ungroup() %>%
spread(Conversion,Lead,fill=0) %>%
group_by(Week) %>%
do(data.frame(.,
Total = sum(.[,-1]))) %>%
ungroup()
# Week A1 A3 Total
# 1 2015-25 2 0 2
# 2 2015-26 0 1 1

Resources