I have a dataset where each row represents a continuous spell with start and end months and years. For spells which are over more than one year, I want to pivot them so that there is one row per year.
Input:
library(data.table)
dat <- data.table(id = c(1,1,2), b_sp_y = c(2008, 2009, 2011), b_sp_m = c(3, 8, 6),
e_sp_y = c(2008, 2010, 2013), e_sp_m = c(5, 1, 9))
id b_sp_y b_sp_m e_sp_y e_sp_m
1: 1 2008 3 2008 5
2: 1 2009 8 2010 1
3: 2 2011 6 2013 9
Here is my truly horrifyingly ugly code:
dat[, y_dif := e_sp_y - b_sp_y]
res <- dat[y_dif == 0][, c("e_sp_y", "y_dif") := NULL]
setnames(res, "b_sp_y", "year")
tmp <- dat[y_dif > 0]
for(i in 1:nrow(tmp)){
foo <- tmp[i, ]
foo2 <- data.table(year = foo$b_sp_y:(foo$b_sp_y + foo$y_dif))[,id := foo$id]
foo2[, b_sp_m := c(foo$b_sp_m, rep(1, foo$y_dif))]
foo2[, e_sp_m := c(rep(12, foo$y_dif), foo$e_sp_m)]
res <- rbind(res, foo2)
}
Output:
id year b_sp_m e_sp_m
1: 1 2008 3 5
2: 1 2009 8 12
3: 1 2010 1 1
4: 2 2011 6 12
5: 2 2012 1 12
6: 2 2013 1 9
This is ugly and slow to a crawl, but I couldn't really come up with anything better.
Thanks for your help!
Proceeding by row fill in the three columns using summarize as shown.
library(data.table)
library(dplyr)
dat %>%
rowwise() %>%
summarize(id = id,
year = b_sp_y:e_sp_y,
b_sp_m = replace(1 + 0 * year, 1, b_sp_m),
e_sp_m = replace(12 + 0 * year, length(year), e_sp_m))
giving:
# A tibble: 6 × 4
id year b_sp_m e_sp_m
<dbl> <int> <dbl> <dbl>
1 1 2008 3 5
2 1 2009 8 12
3 1 2010 1 1
4 2 2011 6 12
5 2 2012 1 12
6 2 2013 1 9
or using only data.table:
library(data.table)
dat[, .(id = id,
year = b_sp_y:e_sp_y,
b_sp_m = replace(1 + 0 * b_sp_y:e_sp_y, 1, b_sp_m),
e_sp_m = replace(12 + 0 * b_sp_y:e_sp_y, e_sp_y - b_sp_y + 1, e_sp_m)),
by = 1:nrow(dat)][, -1]
Added
Here are some slightly more compact variations of the above:
library(data.table)
library(dplyr)
dat %>%
rowwise() %>%
summarize(id = id,
year = b_sp_y:e_sp_y,
b_sp_m = c(b_sp_m, year[-1]^0),
e_sp_m = c(12 * year[-1]^0, e_sp_m))
library(data.table)
dat[, {
year <- b_sp_y:e_sp_y
.(id = id,
year = year,
b_sp_m = c(b_sp_m, year[-1]^0),
e_sp_m = c(12 * year[-1]^0, e_sp_m))
},
by = 1:nrow(dat)][, -1]
I'd suggest: make a date sequence for each id/row, group by id and year, summarize first and last month.
library(dplyr); library(lubridate)
dat %>%
mutate(start = ymd(paste(b_sp_y, b_sp_m, "01", sep = "-")),
end = ymd(paste(e_sp_y, e_sp_m, "01", sep = "-"))) %>%
group_by(id, row = row_number()) %>%
summarize(months = seq.Date(start, end, by = "month")) %>%
group_by(id, year = year(months)) %>%
summarize(from = month(min(months)),
to = month(max(months)), .groups = "drop")
Result:
# A tibble: 6 × 4
id year from to
<dbl> <dbl> <dbl> <dbl>
1 1 2008 3 5
2 1 2009 8 12
3 1 2010 1 1
4 2 2011 6 12
5 2 2012 1 12
6 2 2013 1 9
We create a sequence column 'rn', loop over the year columns, get the sequence in a list, unnest the column, and do a group by the 'rn' and replace the 'b', 'e' columns where there are duplicates to 1 and 12 respectively
library(dplyr)
library(purrr)
library(tidyr)
dat %>%
mutate(rn=row_number(),
year = map2(b_sp_y, e_sp_y, `:`),
b_sp_y= NULL,
e_sp_y = NULL) %>%
unnest(year) %>%
group_by(rn) %>%
mutate(b_sp_m = replace(b_sp_m, duplicated(b_sp_m), 1),
e_sp_m = replace(e_sp_m, duplicated(e_sp_m, fromLast = TRUE) &
n() > 1, 12)) %>%
ungroup %>%
select(-rn) %>%
relocate(year, .after = 1)
-output
# A tibble: 6 × 4
id year b_sp_m e_sp_m
<dbl> <int> <dbl> <dbl>
1 1 2008 3 5
2 1 2009 8 12
3 1 2010 1 1
4 2 2011 6 12
5 2 2012 1 12
6 2 2013 1 9
OP's output of 'res'
> res
id year b_sp_m e_sp_m
<num> <num> <num> <num>
1: 1 2008 3 5
2: 1 2009 8 12
3: 1 2010 1 1
4: 2 2011 6 12
5: 2 2012 1 12
6: 2 2013 1 9
Related
having a dataframe with sales per customer and months.
df <-
data.frame(
stringsAsFactors = FALSE,
date = c("jan","jan","jan","jan",
"jan","jan","jan","feb","feb","feb","feb","feb",
"feb","feb"),
customer = c("john","john","john","Mary",
"Mary","Mary","Mary","Robert","Robert","Mary",
"john","john","Robert","Robert"),
product = c("a","b","d","a","b","c",
"d","a","b","c","a","c","c","d")
date customer product
1 jan john a
2 jan john b
3 jan john d
4 jan Mary a
5 jan Mary b
6 jan Mary c
7 jan Mary d
8 feb Robert a
9 feb Robert b
10 feb Mary c
11 feb john a
12 feb john c
13 feb Robert c
14 feb Robert d
I need to summarize how many times the same customer is present across months and products.
Expected result:
date a b c d same cust
jan 2 2 1 2 0
feb 2 1 2 0 1
same cust 1 0 1 0
A possible solution:
library(tidyverse)
df <-
data.frame(
stringsAsFactors = FALSE,
date = c("jan","jan","jan","jan",
"jan","jan","jan","feb","feb","feb","feb","feb",
"feb","feb"),
customer = c("john","john","john","Mary",
"Mary","Mary","Mary","Robert","Robert","Mary",
"john","john","Robert","Robert"),
product = c("a","b","d","a","b","c",
"d","a","b","c","a","c","c","d"))
df %>%
pivot_wider(date,names_from=product,values_from=customer,values_fn=length)%>%
bind_cols(SCust = table(df$customer, df$date) %>% apply(2, \(x) sum(x>=2))) %>%
bind_rows(c(tibble(date="SCust"),
table(df$customer, df$product) %>% apply(2, \(x) sum(x>=2))))
#> # A tibble: 3 × 6
#> date a b d c SCust
#> <chr> <int> <int> <int> <int> <int>
#> 1 jan 2 2 2 1 2
#> 2 feb 2 1 1 3 2
#> 3 SCust 1 0 0 1 NA
I don't know about the marginals, but for the main table
library(reshape2)
dcast(
df,
date~product,
function(x){length(unique(x))},
value.var="customer"
)
date a b c d
1 feb 2 1 3 1
2 jan 2 2 1 2
You can try
library(tidyverse)
df %>%
pivot_wider(names_from = product, values_from = customer, values_fn = n_distinct) %>%
bind_rows(
df %>%
count(product, customer) %>%
group_by(product) %>%
summarise(n=sum(n-1),
date = "all") %>%
pivot_wider(names_from = product,values_from=n ))
# A tibble: 3 x 5
date a b d c
<chr> <dbl> <dbl> <dbl> <dbl>
1 jan 2 2 2 1
2 feb 2 1 1 3
3 all 1 0 0 1
dt <- data.frame(stringsAsFactors = FALSE,
date = c("jan","jan","jan","jan", "jan","jan","jan","feb","feb","feb","feb","feb","feb","feb"),
customer = c("john","john","john","Mary", "Mary","Mary","Mary","Robert","Robert","Mary","john","john","Robert","Robert"),
product = c("a","b","d","a","b","c","d","a","b","c","a","c","c","d")
)
library(data.table)
setDT(dt)
setorder(dt, product)
rbindlist(list(
dcast(dt[, .(value = .N), by = .(date, product)], date ~ product),
transpose(dt[, .(same_cust_row = .N - length(unique(customer))), by = .(product)], make.names = "product", keep.names = "date")
))
# date a b c d
# 1: feb 2 1 3 1
# 2: jan 2 2 1 2
# 3: same_cust_row 1 0 1 0
Do you need the "detail" data, or just the summary ("same cust") data?
library(dplyr)
library(tidyr)
library(purrr)
# by month / same customer bought in both months
df %>% pivot_wider(names_from = product, values_from = date, values_fn = length) %>%
select(-customer) %>%
map( ~ sum(.x==2))
$a
[1] 1
$b
[1] 0
$d
[1] 0
$c
[1] 1
# by month / same customer bought all (4) products
z <- df %>% pivot_wider(names_from = date, values_from = product, values_fn = length) %>%
select(-customer) %>%
map( ~ sum(.x==4))
$jan
[1] NA
$feb
[1] 1
I need to convert my data, which is on quarterly basis, to monthly, by dividing some variable by 3.
Example dataset:
df <- data.frame(Year = c(2018,2019,2020), qtr = c(1,3,2),
amount = c(3,6,12), variable = c(5,6,7))
df
What I would need is to get months for every quarter, i.e. the final dataset would look like this:
data.frame(Year = c(2018,2018,2018,2019,2019,2019,2020,2020,2020),
qtr = c(1,2,3,7,8,9,4,5,6),
amount = c(1,1,1,2,2,2,4,4,4),
variable = c(5,5,5,6,6,6,7,7,7))
Also, bonus question, how do I print the data frames in this environment
Does this work:
df %>%
mutate(qtr_start_mth = case_when(qtr == 1 ~ 1,
qtr == 2 ~ 4,
qtr == 3 ~ 7,
qtr == 4 ~ 10),
qtr_end_mth = case_when(qtr == 1 ~ 3,
qtr == 2 ~ 6,
qtr == 3 ~ 9,
qtr == 4 ~ 12)) %>%
mutate(month = map2(qtr_start_mth, qtr_end_mth, `:`)) %>%
separate_rows() %>%
unnest(month) %>%
mutate(amount = amount /3) %>%
select(1,2,3,4,7)
# A tibble: 9 x 5
Year qtr amount variable month
<dbl> <dbl> <dbl> <dbl> <int>
1 2018 1 1 5 1
2 2018 1 1 5 2
3 2018 1 1 5 3
4 2019 3 2 6 7
5 2019 3 2 6 8
6 2019 3 2 6 9
7 2020 2 4 7 4
8 2020 2 4 7 5
9 2020 2 4 7 6
Data used:
> dput(df)
structure(list(Year = c(2018, 2019, 2020), qtr = c(1, 3, 2),
amount = c(3, 6, 12), variable = c(5, 6, 7)), class = "data.frame", row.names = c(NA,
-3L))
>
Using base:
do.call(rbind,
c(make.row.names = FALSE,
lapply(split(df, df$Year), function(i){
cbind(i, month = 1:3 + (i$qtr - 1) * 3, row.names = NULL)
})))
# Year qtr amount variable month
# 1 2018 1 3 5 1
# 2 2018 1 3 5 2
# 3 2018 1 3 5 3
# 4 2019 3 6 6 7
# 5 2019 3 6 6 8
# 6 2019 3 6 6 9
# 7 2020 2 12 7 4
# 8 2020 2 12 7 5
# 9 2020 2 12 7 6
IS there a way to transpose and summing distinct values in R For example
df
Cola Order Quantity Loc
ABC 1 4 LocA
ABC 1 4 LocB
CSD 4 6 LocA
CDS 3 2 LocB
We have same values for Order and Quantity but still need to take sum of it.
Expected Output (Transpose with respect to Quantity)
Cola Order Quantity LocA_Quantity Loc B_Quantity
ABC 2 8 4 4
CSD 4 6 6
CDS 3 2 2
Create the dataset:
library(tibble)
df = tribble(
~Cola, ~Order, ~Quantity, ~Loc,
'ABC', 1, 4, 'LocA',
'ABC', 1, 4, 'LocB',
'CSD', 4, 6, 'LocA',
'CDS', 3, 2, 'LocB'
)
Create the summaries:
library(dplyr)
df %>%
group_by(Cola) %>%
summarise(
Order = sum(Order),
LocA_Quantity = sum(Quantity * if_else(Loc == "LocA", 1, 0)),
LocB_Quantity = sum(Quantity * if_else(Loc == "LocB", 1, 0)),
Quantity = sum(Quantity)
)
You can do it for both Quantity and order and drop columns you dont want at the end, i.e.
library(tidyverse)
df %>%
group_by(Cola) %>%
mutate_at(vars(2:3), list(new = sum)) %>%
pivot_wider(names_from = Loc, values_from = 2:3)
## A tibble: 3 x 7
## Groups: Cola [3]
# Cola Order_new Quantity_new Order_LocA Order_LocB Quantity_LocA Quantity_LocB
# <fct> <int> <int> <int> <int> <int> <int>
#1 ABC 2 8 1 1 4 4
#2 CSD 4 6 4 NA 6 NA
#3 CDS 3 2 NA 3 NA 2
1) dplyr/tidyr Using the data shown reproducibly in the Note at the end, sum the orders and quantity and create a Quantity_ column equal to Quantity by Cola. Then reshape the Quantity_ column to wide form.
library(dplyr)
library(tidyr)
df %>%
group_by(Cola) %>%
mutate(Quantity_ = Quantity,
Order = sum(Order),
Quantity = sum(Quantity)) %>%
ungroup %>%
pivot_wider(names_from = "Loc", values_from = "Quantity_",
names_prefix = "Quantity_", values_fill = list(Quantity_ = 0))
giving:
# A tibble: 3 x 5
Cola Order Quantity Quantity_LocA Quantity_LocB
<chr> <int> <int> <int> <int>
1 ABC 2 8 4 4
2 CSD 4 6 6 0
3 CDS 3 2 0 2
2) Base R We can do much the same in base R using transform/ave and reshape like this:
df2 <- transform(df,
Quantity_ = Quantity,
Quantity = ave(Quantity, Cola, FUN = sum),
Order = ave(Order, Cola, FUN = sum))
wide <- reshape(df2, dir = "wide", idvar = c("Cola", "Quantity", "Order"),
timevar = "Loc", sep = "")
wide
## Cola Order Quantity Quantity_LocA Quantity_LocB
## 1 ABC 2 8 4 4
## 3 CSD 4 6 6 NA
## 4 CDS 3 2 NA 2
Note
Lines <- "Cola Order Quantity Loc
ABC 1 4 LocA
ABC 1 4 LocB
CSD 4 6 LocA
CDS 3 2 LocB"
df <- read.table(text = Lines, header = TRUE, as.is = TRUE)
How do I convert the dataframe?
Before:
set.seed(1)
df <- data.frame( n = rpois(16, 2),
year = rep(2011, 16),
month = rep(seq(1,4,1), times = rep(4,4)))
After:
df1 <- data.frame( n = c(8,11,4,9),
year = rep(2011, 4),
month = rep(seq(1,4,1)))
I think that what you want is this, using dplyr:
library(dplyr)
df %>%
group_by(year, month) %>%
summarise(n = sum(n))
# A tibble: 4 x 3
# Groups: year [1]
year month n
<dbl> <dbl> <int>
1 2011 1 8
2 2011 2 11
3 2011 3 4
4 2011 4 9
Using base R with aggregate
aggregate(n ~ ., df, sum)
# year month n
#1 2011 1 8
#2 2011 2 11
#3 2011 3 4
#4 2011 4 9
Sorry if this post is not well organized, first time stack overflower...
I am trying to create a column to create a order within each IDs, but the twist is that if there is a gap year, order needs to start from the beginning.
Please check example and expected result below.
I wasn't able to find appropriate code for it.. I cannot think of anything :( Please help me! I appreciate alot!
One option is to create a new group variable when difference between the year is greater than 1 and create a sequence in each group using row_number().
library(dplyr)
df %>%
group_by(ID, group = cumsum(c(1, diff(Year) > 1))) %>%
mutate(order = row_number()) %>%
ungroup() %>%
select(-group)
# ID Year order
# <fct> <int> <int>
# 1 A 2007 1
# 2 A 2008 2
# 3 A 2009 3
# 4 A 2013 1
# 5 A 2014 2
# 6 A 2015 3
# 7 A 2016 4
# 8 B 2010 1
# 9 B 2012 1
#10 B 2013 2
Using base R ave that would be
as.integer(with(df, ave(ID, ID, cumsum(c(1, diff(Year) > 1)), FUN = seq_along)))
#[1] 1 2 3 1 2 3 4 1 1 2
data
df <- data.frame(ID = c(rep("A", 7), rep("B", 3)),
Year = c(2007:2009, 2013:2016, 2010, 2012, 2013), stringsAsFactors = FALSE)
A data.table option:
library(data.table)
setDT(df)
df[, jump := Year - shift(Year) - 1, by = ID
][is.na(jump), jump := 0
][, order := seq_len(.N), by = .(ID, cumsum(jump))]
# ID Year jump order
# 1: A 2007 0 1
# 2: A 2008 0 2
# 3: A 2009 0 3
# 4: A 2013 3 1
# 5: A 2014 0 2
# 6: A 2015 0 3
# 7: A 2016 0 4
# 8: B 2010 0 1
# 9: B 2012 1 1
# 10: B 2013 0 2
Or using data.table::nafill() available in data.table v1.12.3 (still in development):
df[, jump := nafill(Year - shift(Year) - 1, fill = 0), by = ID
][, order := seq_len(.N), by = .(ID, cumsum(jump))]
We can take the difference of 'Year' and the lag of 'Year', get the cumulative sum, use that in the group_by along with 'ID' and create the order as row_number()
library(dplyr)
df %>%
group_by(ID, grp = cumsum(Year - lag(Year, default = Year[1]) > 1)) %>%
mutate(order = row_number()) %>%
ungroup %>%
select(-grp)
# A tibble: 10 x 3
# ID Year order
# <chr> <dbl> <int>
# 1 A 2007 1
# 2 A 2008 2
# 3 A 2009 3
# 4 A 2013 1
# 5 A 2014 2
# 6 A 2015 3
# 7 A 2016 4
# 8 B 2010 1
# 9 B 2012 1
#10 B 2013 2
data
df <- structure(list(ID = c("A", "A", "A", "A", "A", "A", "A", "B",
"B", "B"), Year = c(2007, 2008, 2009, 2013, 2014, 2015, 2016,
2010, 2012, 2013)), class = "data.frame", row.names = c(NA, -10L
))