This is what I have right now:
final.df <- all.df %>% dplyr::arrange(customer, date) %>%
dplyr::select(year, week, customer, date, ltv_score,
avg_monthly_sales:most_recent_login) %>%
group_by(customer, year) %>%
dplyr::mutate_at(c("date"), list(~lead), n = 1) %>%
data.frame()
I am trying to offset everything by one date within the year/customer to backtest predictions - basically have a rolling input of of each group. I found this snippet elsewhere and modified it for what I need but am getting the following error:
Error: Input must be a vector, not a <formula> object.
Note that all the _at, _all, _if verbs are deprecated in favour of across. For a single column you don't need mutate_at/across.
library(dplyr)
final.df <- all.df %>%
dplyr::arrange(customer, date) %>%
dplyr::select(year, week, customer, date, ltv_score,
avg_monthly_sales:most_recent_login) %>%
group_by(customer, year) %>%
dplyr::mutate(date = lead(date)) %>%
data.frame()
Related
I have 2 codes that manipulate and filter (by date) my data.frame and that work perfectly. Now I want to run the code for not only one day, but for every day in vector:
seq(from=as.Date('2020-03-02'), to=Sys.Date(),by='days')` #.... 538 days
The code I want to run for all the days between 2020-03-02 and today is:
KOKOKO <- data.frame %>%
filter(DATE < '2020-03-02')%>%
summarize(DATE = '2020-03-02', CZK = sum(Objem.v.CZK,na.rm = T)
STAVPTF <- data.frame %>%
filter (DATE < '2020-03-02')%>%
group_by(CP) %>%
summarize(mnozstvi = last(AKTUALNI_MNOZSTVI_AKCIE), DATE = '2020-03-02') %>%
select(DATE,CP,mnozstvi) %>%
rbind(KOKOKO)%>%
drop_na() %>%
So instead of '2020-03-02' I want to fill in all days since '2020-03-02' one after another. And each of the KOKOKO and STAVPTF created for the unique day like this I want to save as a separate data.frame and all of them store in a list.
We could use map to loop over the sequence and apply the code
library(dplyr)
library(purrr)
out <- map(s1, ~ data.frame %>%
filter(DATE < .x)%>%
summarize(DATE = .x, CZK = sum(Objem.v.CZK,na.rm = TRUE))
As this is repeated cycle, a function would make it cleaner
f1 <- function(dat, date_col, group_col, Objem_col, aktualni_col, date_val) {
filtered <- dat %>%
filter({{date_col}} < date_val)
KOKOKO <- filtered %>%
summarize({{date_col}} := date_val,
CZK = sum({{Objem_col}}, na.rm = TRUE)
STAVPTF <- filtered %>%
group_by({{group_col}}) %>%
summarize(mnozstvi = last({{aktualni_col}}),
{{date_col}} := date_val) %>%
select({{date_col}}, {{group_col}}, mnozstvi) %>%
bind_rows(KOKOKO)%>%
drop_na()
return(STAVPTF)
}
and call as
map(s1, ~ f1(data.frame, DATE, CP, Objem.v.CZK, AKTUALNI_MNOZSTVI_AKCIE, !!.x))
where
s1 <- seq(from=as.Date('2020-03-02'), to=Sys.Date(), by='days')
It would be easier to answer your question, if you would provide a minimal reproducible example. It's easy done with tidyverses reprex packages
However, your KOKOKO code can be rewritten as simple cumulative sum:
KOKOKO =
data.frame %>%
arrange(DATE) %>% # if necessary
group_by(DATE) %>%
summarise(CZK = sum(Objem.v.CZK), .groups = 'drop') %>% # summarise per DATE (if necessary)
mutate(CZK = cumsum(CZK) - CZK) # cumulative sum excluding current row (current DATE)
Even STAVPTF code can probably be rewritten without iterations. First find the last value of AKTUALNI_MNOZSTVI_AKCIE per CP and DATE. Then this value is assigned to the next DATE:
STAVPTF <-
data.frame %>%
group_by(CP, DATE) %>%
summarise(mnozstvi = last(AKTUALNI_MNOZSTVI_AKCIE), .groups='drop_last') %>%
arrange(DATE) %>% # if necessary
mutate(DATE = lead(DATE))
I wanted to view the transformed dataframe using the flights dataset but R shows an invalid caption argument error
library(dplyr)
library(nycflights13)
view (temp <- flights %>%
group_by(year, month, day) %>%
mutate(r = min_rank(desc(dep_time))) %>%
filter(r %in% range(r)))
Error in View : invalid caption argument
However, this one with the piping operator works fine.
(temp <- flights %>%
group_by(year, month, day) %>%
mutate(r = min_rank(desc(dep_time))) %>%
filter(r %in% range(r))) %>% view()
So does this one (with a capital V)
View (temp <- flights %>%
group_by(year, month, day) %>%
mutate(r = min_rank(desc(dep_time))) %>%
filter(r %in% range(r)))
Even this one (where the transformed dataframe is not assigned to an object works)
view (flights %>%
group_by(year, month, day) %>%
mutate(r = min_rank(desc(dep_time))) %>%
filter(r %in% range(r)))
Could anyone explain what's happening and why the error in the first case and not the other three? Thank you in advance.
I'd like to see if someone can provide a tidy version of the following problem (ideally in a pipe or something that does not require a loop). I am trying to take a date column, and from it extract a nested list, with each containing 24 dates (sequential).
library(tidyverse)
library(lubridate)
df <-read_csv("https://raw.githubusercontent.com/Nicktz/ExDat/master/extdata/findata.csv") %>% gather(Stock, Price, -Date) %>%
filter(Date <= ymd(20070501)) %>% mutate(Ret = Price / lag(Price) - 1)
DateCol <- df %>% pull(Date) %>% unique
# Roll Window
Roll_Window <- list()
Min_Window <- 24
for( i in Min_Window:length(DateCol)){
Roll_Window[c(i-Min_Window)+1] <-
list(DateCol[c(i-Min_Window+1):i])
}
I would like to create a set of columns based on papers count for each number of year, therefore filtering multiple conditions in dplyr through summarise:
This is my code:
words_list <- data %>%
select(Keywords, year) %>%
unnest_tokens(word, Keywords) %>%
filter(between(year,1990,2017)) %>%
group_by(word) %>%
summarise(papers_count = n()) %>%
arrange(desc(papers_count))
The code above gives me two columns, 'word' and 'papers_count', I would like to create more columns like papers_count (papers_count1990, papers_count1991, etc..) based on each year between 1990 and 2017.
I Am looking for something like ths:
words_list <- data %>%
select(Keywords, year) %>%
unnest_tokens(word, Keywords) %>%
filter(between(year,1990,2017)) %>%
group_by(word) %>%
summarise(tot_papers_count = n(), papers_count_1991 = n()year="1991", ...) %>%
arrange(desc(papers_count))
please does anybody have any suggestion?
I would suggest adding year to the group_by, and then using spread to create multiple summary columns.
library(tidyr)
words_list_by_year <- data %>%
select(Keywords, year) %>%
unnest_tokens(word, Keywords) %>%
filter(between(year,1990,2017)) %>%
group_by(year,word) %>%
summarise(papers_count = n()) %>%
spread(year,papers_count,fill=0)
I've got a data frame (df) with two variables, site and purchase.
I'd like to use dplyr() to group my data by site and purchase, and get the counts and percentages for the grouped data. I'd however also like the tibble to feature rows called ALLSITES, representing the data of all the sites grouped by purchase, so that I end up with a tibble looking similar to dfgoal.
The problem's that my current code doesn't get me the ALLSITES rows. I've tried adding a base R function into dplyr(), which doesn't work.
Any help would be much appreciated.
Starting point (df):
df <- data.frame(site=c("LON","MAD","PAR","MAD","PAR","MAD","PAR","MAD","PAR","LON","MAD","LON","MAD","MAD","MAD"),purchase=c("a1","a2","a1","a1","a1","a1","a1","a1","a1","a2","a1","a2","a1","a2","a1"))
Desired outcome:
dfgoal <- data.frame(site=c("LON","LON","MAD","MAD","PAR","ALLSITES","ALLSITES"),purchase=c("a1","a2","a1","a2","a1","a1","a2"),bin=c(1,2,6,2,4,11,4),pin_per=c(33.33333,66.66667,75.00000,25.00000,100.00000,73.33333,26.66666))
Current code:
library(dplyr)
df %>%
group_by(site, purchase) %>%
summarize(bin = sum(purchase==purchase)) %>%
group_by(site) %>%
mutate(bin_per = (bin/sum(bin)*100))
df %>%
rbind(df, transform(df, site = "ALLSITES") %>%
group_by(site, purchase) %>%
summarize(bin = sum(purchase==purchase)) %>%
group_by(site) %>%
mutate(bin_per = (bin/sum(bin)*100))
We can start from the first output code block, after grouping by 'site' with a created string of 'ALLSITES' and 'purchase' get the sum of 'bin' and later 'bin_per', then with bind_rows row bind the two datasets
df1 %>%
ungroup() %>%
group_by(site = 'ALLSITES', purchase) %>%
summarise(bin = sum(bin)) %>%
ungroup %>%
mutate(bin_per = 100*(bin/sum(bin))) %>%
bind_rows(df1, .)