I'm trying to find the Start and End date of a product's sales period from a column of data that is a dummy variable for sale. Here is a proxy of the type of data that I am working with:
The result I am looking for is:
The actual data set I am working on is much larger than this and does not necessarily look at just 2010-01 to 2011-12.
Thank you!
This assumes only one sale per product
require(tidyverse)
df <- data.frame(product = 'Product A',
month = seq(as.Date('2010-01-01'),
as.Date('2010-10-01'),
by = 'month'
),
onSale = c(rep(0,3), rep(1,4),rep(0,3))
)
df %>%
group_by(product) %>%
summarise(saleStart = month[which.min(month[onSale == 1])],
salend = month[which.max(month[onSale == 1])]
)
Edit:
df <- data.frame(product = 'Product A',
month = seq(as.Date('2010-01-01'),
as.Date('2011-09-01'),
by = 'month'
),
onSale = c(rep(0,3), rep(1,4),rep(0,3), rep(1,4),rep(0,3), rep(1,4))
)
df %>%
group_by(product) %>%
mutate(diff = c(0,diff(onSale))) %>%
group_by(product, diff) %>%
filter(diff == 1) %>%
mutate(monthStart = month, monthEnd = month %m+% months(1)) %>%
select(-month,-diff)
Related
The following loop is effective in that it gets me to the finish line but i'm looking for a way to make it more efficient as I'm looping through a large dataset. Possibly using a Purrr function?
library(tidyverse)
library(timetk)
#### CREATE DATA
df_1 <- data.frame(Date = seq.Date(as.Date("2016-01-01"), length.out = 36, by = "month"),
Inventory = round(runif(36,5,100),0),
Purchases = round(runif(36,5,100),0),
Sales = round(runif(36,5,100),0),
Ending_Inventory = round(runif(36,5,100),0)) %>%
mutate(Starting_Inventory = lag(Ending_Inventory,1)) %>%
mutate(product = "Product_1")
df_2 <- data.frame(Date = seq.Date(as.Date("2016-01-01"), length.out = 36, by = "month"),
Inventory = round(runif(36,5,100),0),
Purchases = round(runif(36,5,100),0),
Sales = round(runif(36,5,100),0),
Ending_Inventory = round(runif(36,5,100),0)) %>%
mutate(Starting_Inventory = lag(Ending_Inventory,1)) %>%
mutate(product = "Product_2")
df <- rbind(df_1, df_2) %>%
group_by(product) %>%
timetk::future_frame(
.date_var = Date,
.length_out = "12 months",
.bind_data = TRUE
)
Here I'm creating a date sequence to iterate through the for loop
#### CREATE DATE SEQUENCE
Dates <- seq(min(df$Date) %m+% months(36), min(df$Date) %m+% months(48), by = "month")
The dates from the sequence above will iterate through the loop to fill in the future data and then I join, rename some columns, and drop all that contain ("y")... Seems like I'm performing some steps that aren't necessary.
for (i in 1:length(Dates)){
df <- df %>%
mutate(Purchases = case_when(Date < Dates[i] ~ Purchases,
Date == Dates[i] ~ lag(Purchases, 12)*1.05,
TRUE ~ 0
)) %>%
mutate(Starting_Inventory = case_when(Date < Dates[i] ~ Starting_Inventory,
Date == Dates[i] ~ lag(Ending_Inventory,1),
TRUE ~ 0
)) %>%
mutate(Sales = case_when(Date < Dates[i] ~ Sales,
Date == Dates[i] ~ lag(Sales,12) * 1.15,
TRUE ~ 0
)) %>%
mutate(Ending_Inventory = case_when(Date < Dates[i] ~ Ending_Inventory,
Date == Dates[i] ~ Starting_Inventory + Sales + Purchases,
TRUE ~ 0
)) %>%
mutate(Inventory = case_when(Date < Dates[i] ~ Inventory,
Date == Dates[i] ~ Ending_Inventory,
TRUE ~ 0
))
new_data <- df[df$Date == (Dates[i]),]
df <- df %>%
left_join(., new_data, by = c("product", "Date")) %>%
mutate(Inventory.x = ifelse(Date == Dates[i],Inventory.y,Inventory.x),
Purchases.x = ifelse(Date == Dates[i],Purchases.y,Purchases.x),
Sales.x = ifelse(Date == Dates[i],Sales.y,Sales.x),
Starting_Inventory.x = ifelse(Date == Dates[i],Starting_Inventory.y,Starting_Inventory.x),
Ending_Inventory.x = ifelse(Date == Dates[i],Ending_Inventory.y,Ending_Inventory.x),
) %>%
rename(Inventory = Inventory.x,
Purchases = Purchases.x,
Starting_Inventory = Starting_Inventory.x,
Sales = Sales.x,
Ending_Inventory = Ending_Inventory.x) %>%
dplyr::select(-contains(".y"))
return
print(i)
gc()
}
There are a lot of unnecessary steps in there.
Mutate can take more than one expression at once.
The case_when is unnecessary since in the next step you only keep the rows that got modified.
Then, for the same reason, the join and renaming is more steps than needed, you can just replace the old rows with the new row by selecting a subset.
for (i in seq_along(Dates)){
new_data <- df2 %>%
mutate(Purchases = lag(Purchases, 12)*1.05,
Starting_Inventory = lag(Ending_Inventory,1),
Sales = lag(Sales,12) * 1.15,
Ending_Inventory = Starting_Inventory + Sales + Purchases,
Inventory = Ending_Inventory)
df2[df2$Date == Dates[i],] <- new_data[new_data$Date == Dates[i],]
}
But then you're stil recalculating your whole data.frame for each loop. No need for that either since mutate() is iterative. You can do it all with just that function.
Also, since there are only 2 conditions really needed, you can replace the case_when with ifelse and it's faster.
df <- df %>%
mutate(
Purchases = ifelse(
Date %in% Dates, lag(Purchases, 12)*1.05, Purchases
),
Starting_Inventory = ifelse(
Date %in% Dates, lag(Ending_Inventory,1), Starting_Inventory
),
Sales = ifelse(
Date %in% Dates, lag(Sales,12) * 1.15, Sales
),
Ending_Inventory = ifelse(
Date %in% Dates, Starting_Inventory + Sales + Purchases,
Ending_Inventory
),
Inventory = ifelse(
Date %in% Dates, Ending_Inventory, Inventory
)
)
Edit:
I think it's important to break down what you're trying to do when you end up with long for loop like this. Since you're trying to do in place modifications, even in base R, you could do this with this short a for loop :
df3 <- df.o
df3 <- df3 |> within({
for (i in which(Date %in% Dates)){
Purchases[i] = Purchases[i-12]*1.05
Sales[i] = Sales[i-12] * 1.15
Ending_Inventory[i] = Starting_Inventory[i] + Sales[i] + Purchases[i]
Inventory[i] = Ending_Inventory[i]
Starting_Inventory[i] = Ending_Inventory[i-1]
}
i = NULL
})
A bit slower than mutate, but it's the same logic.
I am trying to filter out a data set into two months. I would like to filter out the ID and year that have data, and to remove the ID and year that do not have an associated pair.
For example if an ID and year has both the January and July month in the data set, I would like to include this ID and the year in my filtered data. If an ID has only the month of January and not July, I would like to remove this data and not include it in the filtered data set. Is there a good way to do this? Just a note that I wasn't sure how to simulate the uneven data set in the example.
After filtering for my desired output, I test by creating a list for each seasonal month where each ID and year has at least 15 rows associated with it.
library(lubridate)
library(dplyr)
set.seed(12345)
df <- tibble(
date = sample(seq(dmy("01-01-2010"), dmy("31-12-2013"), by = "days"),
1000, replace = TRUE),
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID = rep(1:5, 200),
month = month(date),
year =year(date)) %>%
arrange(ID, date)
df %>%
filter(month %in% c(1,7)) %>%
group_by(ID, year) %>%
mutate(complete = length(unique(month)) == 2) %>%
group_by(ID) %>%
filter(all(complete)) %>%
group_by(ID, year)
# Creates a list for each year and by ID
summer_list <- df %>%
filter(month %in% 7) %>%
filter(n() >= 15) %>%
group_split(year, ID)
# Renames the names in the list to AnimalID and year
names(summer_list) <- sapply(summer_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))
# Creates a list for each year and by ID
winter_list <- df1 %>%
filter(month %in% 1) %>%
filter(n() >= 15) %>%
group_split(year, ID)
# Renames the names in the list to ID and year
names(winter_list) <- sapply(winter_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))
You were really close. I think your filter can be simplified to the following. Just be sure to save it to df.
df <- df %>%
filter(month %in% c(1,7)) %>%
group_by(ID, year) %>%
mutate(complete = length(unique(month)) == 2) %>%
filter(complete)
# could add "%>% select(-c(complete))" to get rid of complete
On summer_list and winter_list, add a group_by between the filters. With the dataset you provided, there were no groups with 15 records, but I tested that this works by bumping up the size of df until I got some.
summer_list <- df %>%
filter(month == 7) %>% # used == since there's only one test value
group_by(ID, year) %>% # added this
filter(n() >= 15) %>%
group_split()
There's also a typo in your first use of winter_list -- the input data is df1, but I think you want df. Hope this works!
Here's the complete code including the larger df:
library(lubridate)
library(dplyr)
set.seed(12345)
df <- tibble(
date = sample(seq(dmy("01-01-2010"), dmy("31-12-2013"), by = "days"),
4000, replace = TRUE),
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID = rep(1:5, 800),
month = month(date),
year =year(date)) %>%
arrange(ID, date)
df <- df %>%
filter(month %in% c(1,7)) %>%
group_by(ID, year) %>%
mutate(complete = length(unique(month)) == 2) %>%
filter(complete)
# could add "%>% select(-c(complete))" to get rid of complete
# Creates a list for each year and by ID
summer_list <- df %>%
filter(month == 7) %>%
group_by(ID, year) %>%
filter(n() >= 15) %>%
group_split()
# Renames the names in the list to AnimalID and year
names(summer_list) <- sapply(summer_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))
# Creates a list for each year and by ID
winter_list <- df %>%
filter(month == 1) %>%
group_by(ID, year) %>%
filter(n() >= 15) %>%
group_split()
# Renames the names in the list to ID and year
names(winter_list) <- sapply(winter_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))
Is there a way to use the filtering function inside the summarise for example:
#I have a dataset with 5 columns Price, Type, Amount USD, date
# I only want the mean price of the rows that are not Type == "SELL"
data = summarise(Price = filter(!Type == "SELL") %>% mean(Price), Amount = sum(Amount), USD = sum(USD), date = min(date))
You can use :
library(dplyr)
data %>%
summarise(Price = mean(Price[Type != 'SELL']),
Amount = sum(Amount),
USD = sum(USD),
date = min(date))
To use it in pipes :
data %>%
summarise(Price = mean(data %>% filter(Type != 'SELL') %>% pull(Price)),
Amount = sum(Amount),
USD = sum(USD),
date = min(date))
I want to replace Jan 01 to Jun 25 of all the years in FakeData with data from Ob2020 for the two variables (Level & Flow) of my data.frame. Here is what i have started and am looking for suggestions to achieving my goal.
library(tidyverse)
library(lubridate)
set.seed(1500)
FakeData <- data.frame(Date = seq(as.Date("2010-01-01"), to = as.Date("2018-12-31"), by = "days"),
Level = runif(3287, 0, 30), Flow = runif(3287, 1,10))
Ob2020 <- data.frame(Date = seq(as.Date("2020-01-01"), to = as.Date("2020-06-25"), by = "days"),
Level = runif(177, 0, 30), Flow = runif(177, 1,10))
Here's a way using dplyr and lubridate :
library(dplyr)
library(lubridate)
FakeData %>%
mutate(day = day(Date), month = month(Date)) %>%
left_join(Ob2020 %>%
mutate(day = day(Date), month = month(Date)),
by = c('day', 'month')) %>%
mutate(Level = coalesce(Level.y, Level.x),
Flow = coalesce(Flow.y, Flow.x)) %>%
select(Date = Date.x, Level, Flow)
If you dont mind a data.table solution, here is an update join:
library(data.table)
#extract year and month of the date
setDT(FakeData)[, c("day", "mth") := .(mday(Date), month(Date))]
setDT(Ob2020)[, c("day", "mth") := .(mday(Date), month(Date))]
#print to console to show old values
head(FakeData)
head(Ob2020)
cols <- c("Level", "Flow")
FakeData[Ob2020[mth<=6L & day<=25], on=.(day, mth),
(cols) := mget(paste0("i.", cols))]
#print to console to show new values
head(FakeData)
I'm trying to filter intraday-data to include only certain period inside the day. Is there a trick in some packages to achieve this. Here is example data:
library(tibbletime)
example <- as.tibble(data.frame(
date = ymd_hms(seq(as.POSIXct("2017-01-01 09:00:00"), as.POSIXct("2017-01-02 20:00:00"), by="min")),
value = rep(1, 2101)))
I would like to include only 10:00:00 - 18:35:00 for each day, but can't achieve this nicely. My solution for now has been creating extra indic columns and then filter by them, but it hasn't worked well either.
You can use the function between() from data.table
example[data.table::between(format(example$date, "%H:%M:%S"),
lower = "10:00:00",
upper = "18:35:00"), ]
library(tibbletime)
library(tidyverse)
library(lubridate)
example <- as.tibble(data.frame(
date = ymd_hms(seq(as.POSIXct("2017-01-01 09:00:00"), as.POSIXct("2017-01-02 20:00:00"), by="min")),
value = rep(1, 2101)))
example %>%
mutate(time = as.numeric(paste0(hour(date),".",minute(date)))) %>%
filter(time >= 10 & time <= 18.35) %>%
select(-time)
This is pretty hacky but if you really want to stay in the tidyverse:
rng <- range((hms("10:00:00") %>% as_datetime()), (hms("18:35:00") %>% as_datetime()))
example %>%
separate(., date, into = c("date", "time"), sep = " ") %>%
mutate(
time = hms(time) %>% as_datetime(),
date = as_date(date)
) %>%
filter(time > rng[1] & time < rng[2]) %>%
separate(., time, into = c("useless", "time"), sep = " ") %>%
select(-useless)