horizontal and vertical join count in R dataframe

I have a dataframe with sales per customer and month.
df <- data.frame(
  stringsAsFactors = FALSE,
  date = c("jan","jan","jan","jan","jan","jan","jan",
           "feb","feb","feb","feb","feb","feb","feb"),
  customer = c("john","john","john","Mary","Mary","Mary","Mary",
               "Robert","Robert","Mary","john","john","Robert","Robert"),
  product = c("a","b","d","a","b","c","d",
              "a","b","c","a","c","c","d")
)
df
date customer product
1 jan john a
2 jan john b
3 jan john d
4 jan Mary a
5 jan Mary b
6 jan Mary c
7 jan Mary d
8 feb Robert a
9 feb Robert b
10 feb Mary c
11 feb john a
12 feb john c
13 feb Robert c
14 feb Robert d
I need to summarize how many times the same customer is present across months and products.
Expected result:
date       a  b  c  d  same cust
jan        2  2  1  2  0
feb        2  1  2  0  1
same cust  1  0  1  0

A possible solution:
library(tidyverse)
df <- data.frame(
  stringsAsFactors = FALSE,
  date = c("jan","jan","jan","jan","jan","jan","jan",
           "feb","feb","feb","feb","feb","feb","feb"),
  customer = c("john","john","john","Mary","Mary","Mary","Mary",
               "Robert","Robert","Mary","john","john","Robert","Robert"),
  product = c("a","b","d","a","b","c","d",
              "a","b","c","a","c","c","d")
)
df %>%
  pivot_wider(date, names_from = product, values_from = customer, values_fn = length) %>%
  bind_cols(SCust = table(df$customer, df$date) %>% apply(2, \(x) sum(x >= 2))) %>%
  bind_rows(c(tibble(date = "SCust"),
              table(df$customer, df$product) %>% apply(2, \(x) sum(x >= 2))))
#> # A tibble: 3 × 6
#> date a b d c SCust
#> <chr> <int> <int> <int> <int> <int>
#> 1 jan 2 2 2 1 2
#> 2 feb 2 1 1 3 2
#> 3 SCust 1 0 0 1 NA
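The product columns come out in first-appearance order (a, b, d, c); if the a-b-c-d order of the expected output matters, a relocate() step can be appended (my addition; res below is a hypothetical name for the stored result of the pipeline above):

# my addition: reorder the product columns to a, b, c, d
res %>% relocate(date, a, b, c, d, SCust)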

I don't know about the marginals, but for the main table
library(reshape2)
dcast(
  df,
  date ~ product,
  function(x) length(unique(x)),
  value.var = "customer"
)
date a b c d
1 feb 2 1 3 1
2 jan 2 2 1 2
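For the marginals, one reading (my assumption, not stated in this answer) of "same cust" per product is the number of customers who bought that product in more than one month; a base R sketch:

# my addition: customers appearing at least twice for a given product across months
same_cust_by_product <- apply(table(df$customer, df$product), 2, function(x) sum(x >= 2))
same_cust_by_product
# a b c d
# 1 0 1 0

With the example data this gives 1 0 1 0, matching the bottom row of the expected result.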

You can try
library(tidyverse)
df %>%
  pivot_wider(names_from = product, values_from = customer, values_fn = n_distinct) %>%
  bind_rows(
    df %>%
      count(product, customer) %>%
      group_by(product) %>%
      summarise(n = sum(n - 1),
                date = "all") %>%
      pivot_wider(names_from = product, values_from = n)
  )
# A tibble: 3 x 5
date a b d c
<chr> <dbl> <dbl> <dbl> <dbl>
1 jan 2 2 2 1
2 feb 2 1 1 3
3 all 1 0 0 1

dt <- data.frame(
  stringsAsFactors = FALSE,
  date = c("jan","jan","jan","jan","jan","jan","jan","feb","feb","feb","feb","feb","feb","feb"),
  customer = c("john","john","john","Mary","Mary","Mary","Mary","Robert","Robert","Mary","john","john","Robert","Robert"),
  product = c("a","b","d","a","b","c","d","a","b","c","a","c","c","d")
)
library(data.table)
setDT(dt)
setorder(dt, product)
rbindlist(list(
  dcast(dt[, .(value = .N), by = .(date, product)], date ~ product),
  transpose(dt[, .(same_cust_row = .N - length(unique(customer))), by = .(product)],
            make.names = "product", keep.names = "date")
))
# date a b c d
# 1: feb 2 1 3 1
# 2: jan 2 2 1 2
# 3: same_cust_row 1 0 1 0

Do you need the "detail" data, or just the summary ("same cust") data?
library(dplyr)
library(tidyr)
library(purrr)
# by product / same customer bought in both months
df %>%
  pivot_wider(names_from = product, values_from = date, values_fn = length) %>%
  select(-customer) %>%
  map(~ sum(.x == 2))
$a
[1] 1
$b
[1] 0
$d
[1] 0
$c
[1] 1
# by month / same customer bought all (4) products
z <- df %>%
  pivot_wider(names_from = date, values_from = product, values_fn = length) %>%
  select(-customer) %>%
  map(~ sum(.x == 4))
z
$jan
[1] NA
$feb
[1] 1
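The NA for jan comes from Robert, who has no jan rows, so his jan cell is NA. If a 0 is preferred for such cells, na.rm can be added (my tweak, not part of the original answer):

df %>%
  pivot_wider(names_from = date, values_from = product, values_fn = length) %>%
  select(-customer) %>%
  map(~ sum(.x == 4, na.rm = TRUE))

With the example data, $jan then becomes 1 (Mary bought all four products in January) and $feb stays 1.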

Related

R: Pivoting multi-year spells into multiple year rows

I have a dataset where each row represents a continuous spell with start and end months and years. For spells which span more than one year, I want to pivot them so that there is one row per year.
Input:
library(data.table)
dat <- data.table(id = c(1, 1, 2), b_sp_y = c(2008, 2009, 2011), b_sp_m = c(3, 8, 6),
                  e_sp_y = c(2008, 2010, 2013), e_sp_m = c(5, 1, 9))
id b_sp_y b_sp_m e_sp_y e_sp_m
1: 1 2008 3 2008 5
2: 1 2009 8 2010 1
3: 2 2011 6 2013 9
Here is my truly horrifyingly ugly code:
dat[, y_dif := e_sp_y - b_sp_y]
res <- dat[y_dif == 0][, c("e_sp_y", "y_dif") := NULL]
setnames(res, "b_sp_y", "year")
tmp <- dat[y_dif > 0]
for (i in 1:nrow(tmp)) {
  foo <- tmp[i, ]
  foo2 <- data.table(year = foo$b_sp_y:(foo$b_sp_y + foo$y_dif))[, id := foo$id]
  foo2[, b_sp_m := c(foo$b_sp_m, rep(1, foo$y_dif))]
  foo2[, e_sp_m := c(rep(12, foo$y_dif), foo$e_sp_m)]
  res <- rbind(res, foo2)
}
Output:
id year b_sp_m e_sp_m
1: 1 2008 3 5
2: 1 2009 8 12
3: 1 2010 1 1
4: 2 2011 6 12
5: 2 2012 1 12
6: 2 2013 1 9
This is ugly and slow to a crawl, but I couldn't really come up with anything better.
Thanks for your help!
Proceeding by row, fill in the three columns using summarize as shown.
library(data.table)
library(dplyr)
dat %>%
  rowwise() %>%
  summarize(id = id,
            year = b_sp_y:e_sp_y,
            # 1 + 0 * year is a vector of 1s the same length as year; its first
            # element is replaced by the real start month (similarly the 12s below)
            b_sp_m = replace(1 + 0 * year, 1, b_sp_m),
            e_sp_m = replace(12 + 0 * year, length(year), e_sp_m))
giving:
# A tibble: 6 × 4
id year b_sp_m e_sp_m
<dbl> <int> <dbl> <dbl>
1 1 2008 3 5
2 1 2009 8 12
3 1 2010 1 1
4 2 2011 6 12
5 2 2012 1 12
6 2 2013 1 9
or using only data.table:
library(data.table)
dat[, .(id = id,
        year = b_sp_y:e_sp_y,
        b_sp_m = replace(1 + 0 * b_sp_y:e_sp_y, 1, b_sp_m),
        e_sp_m = replace(12 + 0 * b_sp_y:e_sp_y, e_sp_y - b_sp_y + 1, e_sp_m)),
    by = 1:nrow(dat)][, -1]
Added
Here are some slightly more compact variations of the above:
library(data.table)
library(dplyr)
dat %>%
  rowwise() %>%
  summarize(id = id,
            year = b_sp_y:e_sp_y,
            b_sp_m = c(b_sp_m, year[-1]^0),
            e_sp_m = c(12 * year[-1]^0, e_sp_m))
library(data.table)
dat[, {
  year <- b_sp_y:e_sp_y
  .(id = id,
    year = year,
    b_sp_m = c(b_sp_m, year[-1]^0),
    e_sp_m = c(12 * year[-1]^0, e_sp_m))
},
by = 1:nrow(dat)][, -1]
I'd suggest: make a date sequence for each id/row, group by id and year, summarize first and last month.
library(dplyr); library(lubridate)
dat %>%
  mutate(start = ymd(paste(b_sp_y, b_sp_m, "01", sep = "-")),
         end = ymd(paste(e_sp_y, e_sp_m, "01", sep = "-"))) %>%
  group_by(id, row = row_number()) %>%
  summarize(months = seq.Date(start, end, by = "month")) %>%
  group_by(id, year = year(months)) %>%
  summarize(from = month(min(months)),
            to = month(max(months)), .groups = "drop")
Result:
# A tibble: 6 × 4
id year from to
<dbl> <dbl> <dbl> <dbl>
1 1 2008 3 5
2 1 2009 8 12
3 1 2010 1 1
4 2 2011 6 12
5 2 2012 1 12
6 2 2013 1 9
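A side note of mine (assuming dplyr >= 1.1): summarize() returning several rows per group is deprecated there, and reframe() is the drop-in replacement for the month-sequence step:

library(dplyr); library(lubridate)
dat %>%
  mutate(start = ymd(paste(b_sp_y, b_sp_m, "01", sep = "-")),
         end = ymd(paste(e_sp_y, e_sp_m, "01", sep = "-"))) %>%
  group_by(id, row = row_number()) %>%
  reframe(months = seq.Date(start, end, by = "month")) %>%
  group_by(id, year = year(months)) %>%
  summarize(from = month(min(months)),
            to = month(max(months)), .groups = "drop")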
We create a sequence column 'rn', loop over the year columns with map2 to get the year sequences as a list column, unnest that column, then group by 'rn' and replace the duplicated 'b_sp_m' and 'e_sp_m' values with 1 and 12, respectively.
library(dplyr)
library(purrr)
library(tidyr)
dat %>%
  mutate(rn = row_number(),
         year = map2(b_sp_y, e_sp_y, `:`),
         b_sp_y = NULL,
         e_sp_y = NULL) %>%
  unnest(year) %>%
  group_by(rn) %>%
  mutate(b_sp_m = replace(b_sp_m, duplicated(b_sp_m), 1),
         e_sp_m = replace(e_sp_m, duplicated(e_sp_m, fromLast = TRUE) &
                            n() > 1, 12)) %>%
  ungroup %>%
  select(-rn) %>%
  relocate(year, .after = 1)
-output
# A tibble: 6 × 4
id year b_sp_m e_sp_m
<dbl> <int> <dbl> <dbl>
1 1 2008 3 5
2 1 2009 8 12
3 1 2010 1 1
4 2 2011 6 12
5 2 2012 1 12
6 2 2013 1 9
OP's output of 'res'
> res
id year b_sp_m e_sp_m
<num> <num> <num> <num>
1: 1 2008 3 5
2: 1 2009 8 12
3: 1 2010 1 1
4: 2 2011 6 12
5: 2 2012 1 12
6: 2 2013 1 9

Find max value for each partition in dataframe in R

I have data as follows:
ID Date1 VarA
1 2005-01-02 x
1 2021-01-02 20
1 2021-01-01 y
2 2020-12-20 No
2 2020-12-19 10
3 1998-05-01 0
Here is the R code to reproduce the data:
example <- data.frame(ID = c(1, 1, 1, 2, 2, 3),
                      Date1 = c('2005-01-02',
                                '2021-01-02',
                                '2021-01-01',
                                '2020-12-20',
                                '2020-12-19',
                                '1998-05-01'),
                      VarA = c('x', '20', 'y', 'No', '10', '0'))
I would prefer the solution to do the following:
First, flag the maximum date per ID in the data.
ID Date1 VarA Last_visit
1 2005-01-02 x 0
1 2021-01-02 20 1
1 2021-01-01 y 0
2 2020-12-20 No 1
2 2020-12-19 10 0
3 1998-05-01 0 1
Finally, it should retain only the rows where Last_visit = 1:
ID Date1 VarA Last_visit
1 2021-01-02 20 1
2 2020-12-20 No 1
3 1998-05-01 0 1
I am requesting the intermediate steps as well to perform a sanity check. Thanks!
We create a new column after grouping by 'ID'
library(dplyr)
example %>%
  group_by(ID) %>%
  mutate(Last_visit = +(row_number() %in% which.max(as.Date(Date1)))) %>%
  ungroup
and then filter/slice based on the column
example %>%
  group_by(ID) %>%
  mutate(Last_visit = +(row_number() %in% which.max(as.Date(Date1)))) %>%
  slice_max(n = 1, order_by = Last_visit) %>%
  ungroup
-output
# A tibble: 3 × 4
ID Date1 VarA Last_visit
<dbl> <chr> <chr> <int>
1 1 2021-01-02 20 1
2 2 2020-12-20 No 1
3 3 1998-05-01 0 1
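If you literally want the two steps described in the question (flag, then keep the flagged rows), filtering on the new column works as well (a small variant of the above, my addition):

example %>%
  group_by(ID) %>%
  mutate(Last_visit = +(row_number() %in% which.max(as.Date(Date1)))) %>%
  ungroup %>%
  filter(Last_visit == 1)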
Another option is to convert the 'Date1' to Date class first, then do an arrange and use distinct
example %>%
  mutate(Date1 = as.Date(Date1)) %>%
  arrange(ID, desc(Date1)) %>%
  distinct(ID, .keep_all = TRUE) %>%
  mutate(Last_visit = 1)
ID Date1 VarA Last_visit
1 1 2021-01-02 20 1
2 2 2020-12-20 No 1
3 3 1998-05-01 0 1
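slice_max() can also work directly on the date, which avoids both the arrange/distinct pair and the manual flag (my addition; Last_visit is added back only to match the requested output):

library(dplyr)
example %>%
  mutate(Date1 = as.Date(Date1)) %>%
  group_by(ID) %>%
  slice_max(Date1, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(Last_visit = 1)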

Splitting the date alongside pivot_wider

I have the following table.
Date         Cat
15/2/1999    A
15/2/1999    A
15/2/1999    B
15/5/1999    A
15/5/1999    B
15/10/1999   C
15/10/1999   C
15/2/2001    A
15/2/2001    A
15/6/2001    B
15/6/2001    B
15/6/2001    C
15/11/2001   C
15/11/2001   C
I would like to apply pivot_wider (or any similar function) to it, while also accounting for the Month and Year columns as seen below. The Cat column is split into the values A, B and C, and the counts are displayed.
Month      Year  A  B  C  Total
February   1999  2  1  0  3
May        1999  1  1  0  2
October    1999  0  0  2  2
February   2001  2  0  0  2
June       2001  0  2  1  3
November   2001  0  0  2  2
Does anyone here know how I can do both together? Thanks
You can do this with tidyverse packages. First, format your date column as a date, then count by month, pivot to wide format, and format the table.
library(tidyverse)
library(lubridate)  # year() and month() are used unqualified below

data %>%
  mutate(Date = as.Date(Date, format = "%d/%m/%Y")) %>%
  group_by(Cat, month = lubridate::floor_date(Date, "month")) %>%
  count(Cat) %>%
  pivot_wider(names_from = Cat, values_from = n, values_fill = 0) %>%
  mutate(year = year(month), .before = "A",
         month = month(month, label = TRUE, abbr = FALSE)) %>%
  mutate(Total = rowSums(across(A:C))) %>%
  arrange(year)
month year A B C Total
<ord> <dbl> <int> <int> <int> <dbl>
1 February 1999 2 1 0 3
2 May 1999 1 1 0 2
3 October 1999 0 0 2 2
4 February 2001 2 0 0 2
5 June 2001 0 2 1 3
6 November 2001 0 0 2 2
data
data <- structure(list(Date = c("15/2/1999", "15/2/1999", "15/2/1999",
"15/5/1999", "15/5/1999", "15/10/1999", "15/10/1999", "15/2/2001",
"15/2/2001", "15/6/2001", "15/6/2001", "15/6/2001", "15/11/2001",
"15/11/2001"), Cat = c("A", "A", "B", "A", "B", "C", "C", "A",
"A", "B", "B", "C", "C", "C")), class = "data.frame", row.names = c(NA,
-14L))
Another possible solution:
library(tidyverse)
library(lubridate)
df <- data.frame(
  stringsAsFactors = FALSE,
  Date = c("15/2/1999", "15/2/1999", "15/2/1999", "15/5/1999", "15/5/1999",
           "15/10/1999", "15/10/1999", "15/2/2001", "15/2/2001",
           "15/6/2001", "15/6/2001", "15/6/2001", "15/11/2001", "15/11/2001"),
  Cat = c("A","A","B","A","B","C","C","A","A","B","B","C","C","C")
)
df %>%
  mutate(Month = month(dmy(Date), label = TRUE), Year = year(dmy(Date))) %>%
  pivot_wider(id_cols = c(Month, Year), names_from = Cat,
              values_from = Cat, values_fn = length, values_fill = 0) %>%
  mutate(Total = rowSums(.[3:5]))
#> # A tibble: 6 × 6
#> Month Year A B C Total
#> <ord> <dbl> <int> <int> <int> <dbl>
#> 1 Feb 1999 2 1 0 3
#> 2 May 1999 1 1 0 2
#> 3 Oct 1999 0 0 2 2
#> 4 Feb 2001 2 0 0 2
#> 5 Jun 2001 0 2 1 3
#> 6 Nov 2001 0 0 2 2
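For a quick base R cross-check of the counts and the per-month totals (my addition; it keys on a "YYYY-MM" string rather than separate Month and Year columns):

df2 <- transform(df, month = format(as.Date(Date, "%d/%m/%Y"), "%Y-%m"))
tab <- xtabs(~ month + Cat, df2)
addmargins(tab, margin = 2)  # appends a Sum column holding the per-month totals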

count total in column based on other columns in R

How do I convert the dataframe?
Before:
set.seed(1)
df <- data.frame(n = rpois(16, 2),
                 year = rep(2011, 16),
                 month = rep(seq(1, 4, 1), times = rep(4, 4)))
After:
df1 <- data.frame(n = c(8, 11, 4, 9),
                  year = rep(2011, 4),
                  month = rep(seq(1, 4, 1)))
I think that what you want is this, using dplyr:
library(dplyr)
df %>%
  group_by(year, month) %>%
  summarise(n = sum(n))
# A tibble: 4 x 3
# Groups: year [1]
year month n
<dbl> <dbl> <int>
1 2011 1 8
2 2011 2 11
3 2011 3 4
4 2011 4 9
Using base R with aggregate
aggregate(n ~ ., df, sum)
# year month n
#1 2011 1 8
#2 2011 2 11
#3 2011 3 4
#4 2011 4 9
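And a data.table equivalent, for comparison (my addition):

library(data.table)
as.data.table(df)[, .(n = sum(n)), by = .(year, month)]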

keep only consecutive observations

As said in the title, I have a data.frame like below,
df<-data.frame('id'=c('1','1','1','1','1','1','1'),'time'=c('1998','2000','2001','2002','2003','2004','2007'))
df
id time
1 1 1998
2 1 2000
3 1 2001
4 1 2002
5 1 2003
6 1 2004
7 1 2007
There are some other cases with shorter or longer time windows than this; this one is just for illustration's sake.
I want to do two things with this data set. First, find all the ids that have at least five consecutive observations; this can be done by following existing solutions. Second, I want to keep only the observations belonging to the runs of at least five consecutive rows per id selected in the first step. The ideal result would be:
df
id time
1 1 2000
2 1 2001
3 1 2002
4 1 2003
5 1 2004
I could write a complex function using a for loop and the diff function, but this may be very time consuming, both in writing the function and in getting the result if I have a bigger data set with lots of ids. This does not seem like the R way, and I believe there should be a one- or two-line solution.
Does anyone know how to achieve this? Your time and knowledge would be deeply appreciated. Thanks in advance.
You can use dplyr to group by id and consecutive time, and filter out groups with fewer than 5 entries, i.e.
# read data with stringsAsFactors = FALSE
df <- data.frame('id' = c('1','1','1','1','1','1','1'),
                 'time' = c('1998','2000','2001','2002','2003','2004','2007'),
                 stringsAsFactors = FALSE)

library(dplyr)

df %>%
  mutate(time = as.integer(time)) %>%
  group_by(id, grp = cumsum(c(1, diff(time) != 1))) %>%
  filter(n() >= 5)
which gives
# A tibble: 5 x 3
# Groups: id, grp [1]
id time grp
<chr> <int> <dbl>
1 1 2000 2
2 1 2001 2
3 1 2002 2
4 1 2003 2
5 1 2004 2
Similar to @Sotos' answer, this solution instead uses seqle (from cgwtools) as the grouping variable:
library(dplyr)
library(cgwtools)
df %>%
  mutate(time = as.numeric(time)) %>%
  group_by(id, consec = rep(seqle(time)$length, seqle(time)$length)) %>%
  filter(consec >= 5)
Result:
# A tibble: 5 x 3
# Groups: id, consec [1]
id time consec
<chr> <dbl> <int>
1 1 2000 5
2 1 2001 5
3 1 2002 5
4 1 2003 5
5 1 2004 5
To remove grouping variable:
df %>%
  mutate(time = as.numeric(time)) %>%
  group_by(id, consec = rep(seqle(time)$length, seqle(time)$length)) %>%
  filter(consec >= 5) %>%
  ungroup() %>%
  select(-consec)
Result:
# A tibble: 5 x 2
id time
<chr> <dbl>
1 1 2000
2 1 2001
3 1 2002
4 1 2003
5 1 2004
Data:
df <- data.frame('id' = c('1','1','1','1','1','1','1'),
                 'time' = c('1998','2000','2001','2002','2003','2004','2007'),
                 stringsAsFactors = FALSE)
Try this on your data (%>% here needs magrittr or dplyr attached):
library(magrittr)  # for %>%

df[,] <- lapply(df, function(x) type.convert(as.character(x), as.is = TRUE))
# distance to the neighbouring time values
IND1 <- (df$time - c(df$time[-1], df$time[length(df$time) - 1])) %>% abs(.)
IND2 <- (df$time - c(df$time[2], df$time[-(length(df$time))])) %>% abs(.)
# keep rows with a neighbour one year away, then ids with at least 5 such rows
df <- df[IND1 %in% 1 | IND2 %in% 1, ]
df[ave(df$time, df$id, FUN = length) >= 5, ]
A solution using dplyr, tidyr, and data.table:
library(dplyr)
library(tidyr)
library(data.table)
df2 <- df %>%
  mutate(time = as.numeric(as.character(time))) %>%
  arrange(id, time) %>%
  right_join(data_frame(time = full_seq(.$time, 1)), by = "time") %>%
  mutate(RunID = rleid(id)) %>%
  group_by(RunID) %>%
  filter(n() >= 5, !is.na(id)) %>%
  ungroup() %>%
  select(-RunID)
df2
# A tibble: 5 x 2
id time
<fctr> <dbl>
1 1 2000
2 1 2001
3 1 2002
4 1 2003
5 1 2004
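For completeness, a data.table sketch of the same cumsum/diff grouping idea used in the dplyr answers above (my addition):

library(data.table)
dt <- as.data.table(df)
dt[, time := as.integer(as.character(time))]  # handles factor or character time
dt[, grp := cumsum(c(1L, diff(time) != 1L)), by = id]
dt[, if (.N >= 5L) .SD, by = .(id, grp)][, grp := NULL][]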
