count total in column based on other columns - R - r

How do I convert the dataframe?
Before:
# Example input: 16 Poisson-distributed counts, four per month, all in 2011.
set.seed(1)
df <- data.frame(
  n = rpois(16, 2),
  year = rep(2011, 16),
  month = rep(seq(1, 4, 1), each = 4)
)
After:
# Desired result: one row per month, with n holding that month's total.
df1 <- data.frame(
  n = c(8, 11, 4, 9),
  year = rep(2011, 4),
  month = seq(1, 4, 1)
)

I think that what you want is this, using dplyr:
library(dplyr)
# Total n for every year/month pair; the result stays grouped by year.
summarise(
  group_by(df, year, month),
  n = sum(n)
)
# A tibble: 4 x 3
# Groups: year [1]
year month n
<dbl> <dbl> <int>
1 2011 1 8
2 2011 2 11
3 2011 3 4
4 2011 4 9

Using base R with aggregate
# Sum n for each combination of the remaining columns (year and month).
aggregate(n ~ year + month, data = df, FUN = sum)
# year month n
#1 2011 1 8
#2 2011 2 11
#3 2011 3 4
#4 2011 4 9

Related

R: Pivoting multi-year spells into multiple year rows

I have a dataset where each row represents a continuous spell with start and end months and years. For spells which are over more than one year, I want to pivot them so that there is one row per year.
Input:
library(data.table)
# One row per spell: start year/month (b_sp_*) and end year/month (e_sp_*).
dat <- data.table(
  id     = c(1, 1, 2),
  b_sp_y = c(2008, 2009, 2011),
  b_sp_m = c(3, 8, 6),
  e_sp_y = c(2008, 2010, 2013),
  e_sp_m = c(5, 1, 9)
)
id b_sp_y b_sp_m e_sp_y e_sp_m
1: 1 2008 3 2008 5
2: 1 2009 8 2010 1
3: 2 2011 6 2013 9
Here is my truly horrifyingly ugly code:
# Number of additional calendar years each spell runs into
# (0 means the spell starts and ends in the same year).
dat[, y_dif := e_sp_y - b_sp_y]

# Single-year spells need no expansion: drop the helper columns and rename.
res <- dat[y_dif == 0][, c("e_sp_y", "y_dif") := NULL]
setnames(res, "b_sp_y", "year")

# Multi-year spells are expanded into one row per calendar year.
tmp <- dat[y_dif > 0]
# seq_len() yields an empty sequence when there are no multi-year spells
# (1:nrow(tmp) would iterate over c(1, 0)). The pieces are collected in a list
# and bound once instead of growing `res` with rbind() on every iteration.
expanded <- lapply(seq_len(nrow(tmp)), function(i) {
  foo <- tmp[i, ]
  foo2 <- data.table(year = foo$b_sp_y:(foo$b_sp_y + foo$y_dif))[, id := foo$id]
  # The first year keeps the real start month; later years start in January.
  foo2[, b_sp_m := c(foo$b_sp_m, rep(1, foo$y_dif))]
  # The last year keeps the real end month; earlier years end in December.
  foo2[, e_sp_m := c(rep(12, foo$y_dif), foo$e_sp_m)]
  foo2
})
# use.names = TRUE: the pieces hold the same columns as `res` in another order.
res <- rbindlist(c(list(res), expanded), use.names = TRUE)
Output:
id year b_sp_m e_sp_m
1: 1 2008 3 5
2: 1 2009 8 12
3: 1 2010 1 1
4: 2 2011 6 12
5: 2 2012 1 12
6: 2 2013 1 9
This is ugly and slow to a crawl, but I couldn't really come up with anything better.
Thanks for your help!
Proceeding by row fill in the three columns using summarize as shown.
library(data.table)
library(dplyr)
# Expand every spell (row) into one row per calendar year it covers.
# reframe() is the multi-row counterpart of summarize(): returning more than
# one row per group from summarize() is deprecated since dplyr 1.1.0.
dat %>%
  rowwise() %>%
  reframe(
    id = id,
    year = b_sp_y:e_sp_y,
    # all years start in month 1, except the first, which keeps b_sp_m
    b_sp_m = replace(1 + 0 * year, 1, b_sp_m),
    # all years end in month 12, except the last, which keeps e_sp_m
    e_sp_m = replace(12 + 0 * year, length(year), e_sp_m)
  )
giving:
# A tibble: 6 × 4
id year b_sp_m e_sp_m
<dbl> <int> <dbl> <dbl>
1 1 2008 3 5
2 1 2009 8 12
3 1 2010 1 1
4 2 2011 6 12
5 2 2012 1 12
6 2 2013 1 9
or using only data.table:
library(data.table)
# Same expansion with data.table only, processing one input row per group.
# seq_len() is used for the row index so an empty table yields an empty
# grouping vector (1:nrow(dat) would be c(1, 0)); the year sequence is
# computed once per row instead of three times.
dat[, {
  year <- b_sp_y:e_sp_y
  .(id = id,
    year = year,
    # all years start in month 1, except the first, which keeps b_sp_m
    b_sp_m = replace(1 + 0 * year, 1, b_sp_m),
    # all years end in month 12, except the last, which keeps e_sp_m
    e_sp_m = replace(12 + 0 * year, length(year), e_sp_m))
},
by = seq_len(nrow(dat))][, -1]  # [, -1] drops the helper grouping column
Added
Here are some slightly more compact variations of the above:
library(data.table)
library(dplyr)
# Compact variant: `year[-1]^0` is a vector of ones with one element fewer
# than `year`, used as filler for the January starts / (times 12) December ends.
# reframe() replaces summarize(), whose multi-row results are deprecated since
# dplyr 1.1.0.
dat %>%
  rowwise() %>%
  reframe(
    id = id,
    year = b_sp_y:e_sp_y,
    b_sp_m = c(b_sp_m, year[-1]^0),
    e_sp_m = c(12 * year[-1]^0, e_sp_m)
  )
library(data.table)
# Compact data.table variant, one input row per group.
# seq_len() keeps the grouping vector empty for an empty table, where
# 1:nrow(dat) would evaluate to c(1, 0).
dat[, {
  year <- b_sp_y:e_sp_y
  # `year[-1]^0` is a vector of ones with one element fewer than `year`
  .(id = id,
    year = year,
    b_sp_m = c(b_sp_m, year[-1]^0),
    e_sp_m = c(12 * year[-1]^0, e_sp_m))
},
by = seq_len(nrow(dat))][, -1]  # [, -1] drops the helper grouping column
I'd suggest: make a date sequence for each id/row, group by id and year, summarize first and last month.
library(dplyr); library(lubridate)
# Build a monthly date sequence per spell, then take the first and last month
# seen within each id/year.
dat %>%
  mutate(
    start = ymd(paste(b_sp_y, b_sp_m, "01", sep = "-")),
    end = ymd(paste(e_sp_y, e_sp_m, "01", sep = "-"))
  ) %>%
  group_by(id, row = row_number()) %>%
  # reframe() rather than summarize(): each spell produces several rows, which
  # summarize() has deprecated since dplyr 1.1.0
  reframe(months = seq.Date(start, end, by = "month")) %>%
  group_by(id, year = year(months)) %>%
  summarize(
    from = month(min(months)),
    to = month(max(months)),
    .groups = "drop"
  )
Result:
# A tibble: 6 × 4
id year from to
<dbl> <dbl> <dbl> <dbl>
1 1 2008 3 5
2 1 2009 8 12
3 1 2010 1 1
4 2 2011 6 12
5 2 2012 1 12
6 2 2013 1 9
We create a sequence column 'rn', loop over the year columns, get the sequence in a list, unnest the column, and do a group by the 'rn' and replace the 'b', 'e' columns where there are duplicates to 1 and 12 respectively
library(dplyr)
library(purrr)
library(tidyr)
# Expand each spell to one row per year, then patch the month columns:
# within a spell every repeated start month becomes 1 and every end month
# except the last becomes 12.
dat %>%
  mutate(
    rn = row_number(),
    year = map2(b_sp_y, e_sp_y, function(first_year, last_year) first_year:last_year)
  ) %>%
  select(-c(b_sp_y, e_sp_y)) %>%
  unnest(year) %>%
  group_by(rn) %>%
  mutate(
    b_sp_m = replace(b_sp_m, duplicated(b_sp_m), 1),
    e_sp_m = replace(
      e_sp_m,
      duplicated(e_sp_m, fromLast = TRUE) & n() > 1,
      12
    )
  ) %>%
  ungroup() %>%
  select(-rn) %>%
  relocate(year, .after = 1)
-output
# A tibble: 6 × 4
id year b_sp_m e_sp_m
<dbl> <int> <dbl> <dbl>
1 1 2008 3 5
2 1 2009 8 12
3 1 2010 1 1
4 2 2011 6 12
5 2 2012 1 12
6 2 2013 1 9
OP's output of 'res'
> res
id year b_sp_m e_sp_m
<num> <num> <num> <num>
1: 1 2008 3 5
2: 1 2009 8 12
3: 1 2010 1 1
4: 2 2011 6 12
5: 2 2012 1 12
6: 2 2013 1 9

horizontal and vertical join count in r dataframe

I have a dataframe with sales per customer and month.
# Sales records: one row per (month, customer, product) purchase.
df <-
  data.frame(
    stringsAsFactors = FALSE,
    date = c("jan", "jan", "jan", "jan",
             "jan", "jan", "jan", "feb", "feb", "feb", "feb", "feb",
             "feb", "feb"),
    customer = c("john", "john", "john", "Mary",
                 "Mary", "Mary", "Mary", "Robert", "Robert", "Mary",
                 "john", "john", "Robert", "Robert"),
    product = c("a", "b", "d", "a", "b", "c",
                "d", "a", "b", "c", "a", "c", "c", "d")
  )  # closing parenthesis of data.frame() was missing
date customer product
1 jan john a
2 jan john b
3 jan john d
4 jan Mary a
5 jan Mary b
6 jan Mary c
7 jan Mary d
8 feb Robert a
9 feb Robert b
10 feb Mary c
11 feb john a
12 feb john c
13 feb Robert c
14 feb Robert d
I need to summarize how many times the same customer is present across months and products.
Expected result:
date a b c d same cust
jan 2 2 1 2 0
feb 2 1 2 0 1
same cust 1 0 1 0
A possible solution:
library(tidyverse)
# Sales records: one row per (month, customer, product) purchase.
df <- data.frame(
  date = rep(c("jan", "feb"), each = 7),
  customer = rep(
    c("john", "Mary", "Robert", "Mary", "john", "Robert"),
    times = c(3, 4, 2, 1, 2, 2)
  ),
  product = c("a", "b", "d", "a", "b", "c", "d", "a", "b", "c", "a", "c", "c", "d"),
  stringsAsFactors = FALSE
)
# Purchases per month and product, plus an "SCust" margin counting customers
# that appear at least twice in a month (column) or for a product (row).
df %>%
  # id_cols is named explicitly: passing it by position is deprecated
  # since tidyr 1.3.0
  pivot_wider(
    id_cols = date,
    names_from = product,
    values_from = customer,
    values_fn = length
  ) %>%
  # table() orders the months alphabetically (feb, jan) while the pivoted rows
  # follow order of appearance (jan, feb), so the counts are looked up by name
  # instead of being bound by position
  mutate(
    SCust = unname(
      apply(table(df$customer, df$date), 2, \(x) sum(x >= 2))[date]
    )
  ) %>%
  bind_rows(c(
    tibble(date = "SCust"),
    table(df$customer, df$product) %>% apply(2, \(x) sum(x >= 2))
  ))
#> # A tibble: 3 × 6
#> date a b d c SCust
#> <chr> <int> <int> <int> <int> <int>
#> 1 jan 2 2 2 1 2
#> 2 feb 2 1 1 3 2
#> 3 SCust 1 0 0 1 NA
I don't know about the marginals, but for the main table
library(reshape2)
# Number of distinct customers per month (rows) and product (columns).
dcast(
  df,
  date ~ product,
  fun.aggregate = function(buyers) length(unique(buyers)),
  value.var = "customer"
)
date a b c d
1 feb 2 1 3 1
2 jan 2 2 1 2
You can try
library(tidyverse)
# Distinct customers per month and product, followed by an "all" row holding,
# per product, the number of purchases beyond each customer's first one.
bind_rows(
  pivot_wider(
    df,
    names_from = product,
    values_from = customer,
    values_fn = n_distinct
  ),
  df %>%
    count(product, customer) %>%
    group_by(product) %>%
    summarise(n = sum(n - 1), date = "all") %>%
    pivot_wider(names_from = product, values_from = n)
)
# A tibble: 3 x 5
date a b d c
<chr> <dbl> <dbl> <dbl> <dbl>
1 jan 2 2 2 1
2 feb 2 1 1 3
3 all 1 0 0 1
dt <- data.frame(
  stringsAsFactors = FALSE,
  date = c("jan", "jan", "jan", "jan", "jan", "jan", "jan",
           "feb", "feb", "feb", "feb", "feb", "feb", "feb"),
  customer = c("john", "john", "john", "Mary", "Mary", "Mary", "Mary",
               "Robert", "Robert", "Mary", "john", "john", "Robert", "Robert"),
  product = c("a", "b", "d", "a", "b", "c", "d", "a", "b", "c", "a", "c", "c", "d")
)
library(data.table)
setDT(dt)
# Sort by product so the per-product margin below comes out in a, b, c, d order.
setorder(dt, product)
rbindlist(list(
  # purchases per month (rows) and product (columns); value.var is stated
  # explicitly instead of letting dcast() guess it
  dcast(dt[, .(value = .N), by = .(date, product)], date ~ product,
        value.var = "value"),
  # per product: purchases minus distinct customers, i.e. repeat purchases,
  # turned into a single row labelled "same_cust_row"
  transpose(
    dt[, .(same_cust_row = .N - length(unique(customer))), by = .(product)],
    make.names = "product",
    keep.names = "date"
  )
))
# date a b c d
# 1: feb 2 1 3 1
# 2: jan 2 2 1 2
# 3: same_cust_row 1 0 1 0
Do you need the "detail" data, or just the summary ("same cust") data?
library(dplyr)
library(tidyr)
library(purrr)
# by month / same customer bought in both months
# Per product: how many customers have exactly two purchase records of it.
df %>%
  pivot_wider(names_from = product, values_from = date, values_fn = length) %>%
  select(-customer) %>%
  map(function(n_purchases) sum(n_purchases == 2))
$a
[1] 1
$b
[1] 0
$d
[1] 0
$c
[1] 1
# by month / same customer bought all (4) products
# A customer without purchases in a month becomes NA after pivoting; na.rm
# keeps such a customer from turning the whole month's count into NA.
z <- df %>%
  pivot_wider(names_from = date, values_from = product, values_fn = length) %>%
  select(-customer) %>%
  map(~ sum(.x == 4, na.rm = TRUE))
z
#> $jan
#> [1] 1
#>
#> $feb
#> [1] 1

How do I create a new factor level that summarizes total values of other factor levels?

I have a dataset where I have at least three columns
year sex value
1 2019 M 10
2 2019 F 20
3 2020 M 50
4 2020 F 20
I would like to group by the first column, year, and then add another level to sex that corresponds the total value in column 3, that is, I would like something like this:
year sex value
<int> <chr> <dbl>
1 2019 M 10
2 2019 F 20
3 2019 Total 30
4 2020 M 50
5 2020 F 20
6 2020 Total 70
Any help is appreciated, especially in dplyr.
Here is just another way of doing this:
library(dplyr)
library(purrr)
# Split by year, append a "Total" row to each piece, and stack the pieces again.
df %>%
  group_split(year) %>%
  map_dfr(function(one_year) {
    add_row(
      one_year,
      year = first(one_year$year),
      sex = "Total",
      value = sum(one_year$value)
    )
  })
# A tibble: 6 x 3
year sex value
<int> <chr> <dbl>
1 2019 M 10
2 2019 F 20
3 2019 Total 30
4 2020 M 50
5 2020 F 20
6 2020 Total 70
You can summarise the data for each year and bind it to the original dataset.
library(dplyr)
# Yearly totals are computed separately, stacked on top of the original rows,
# and the combined table is sorted by year and sex.
arrange(
  bind_rows(
    summarise(group_by(df, year), sex = "Total", value = sum(value)),
    df
  ),
  year, sex
)
# year sex value
# <int> <chr> <dbl>
#1 2019 F 20
#2 2019 M 10
#3 2019 Total 30
#4 2020 F 20
#5 2020 M 50
#6 2020 Total 70
Or in base R -
# Base R: yearly sums labelled "Total", followed by the original rows.
rbind(
  transform(aggregate(value ~ year, data = df, FUN = sum), sex = "Total"),
  df
)
data
# Example data: one value per year and sex.
df <- data.frame(
  year = rep(2019:2020, each = 2),
  sex = c("M", "F"),  # recycled to M, F, M, F
  value = c(10, 20, 50, 20)
)

How to create a previous row difference based on a columns in R

I have a data frame like this:
# Example input: two consecutive months for each of three years.
my_df <- data.frame(
  year = rep(c("2018", "2017", "2016"), each = 2),
  my_month = c(6, 7, 8, 9, 4, 5),
  val = c(5, 9, 3, 2, 1, 1)
)
> my_df
year my_month val
1 2018 6 5
2 2018 7 9
3 2017 8 3
4 2017 9 2
5 2016 4 1
6 2016 5 1
I need a data frame like this:
# Desired output: the input plus pre_month, which is NA for the first month of
# each year and the change in val otherwise.
my_df_2 <- data.frame(
  year = rep(c("2018", "2017", "2016"), each = 2),
  my_month = c(6, 7, 8, 9, 4, 5),
  val = c(5, 9, 3, 2, 1, 1),
  pre_month = c(NA, 4, NA, -1, NA, 0)
)
> my_df_2
year my_month val pre_month
1 2018 6 5 NA
2 2018 7 9 4
3 2017 8 3 NA
4 2017 9 2 -1
5 2016 4 1 NA
6 2016 5 1 0
Basically, the "pre_month" column is created by taking the "val" of each "my_month" row within a given year and subtracting the "val" of the previous month. So for 7-2018 -> 9-5=4, and so on.
Thank you for your help.
Here's a solution using tidyverse.
my_df <- data.frame(
  year = c("2018", "2018", "2017", "2017", "2016", "2016"),
  my_month = c(6, 7, 8, 9, 4, 5),
  val = c(5, 9, 3, 2, 1, 1)
)
library(tidyverse)
my_df %>%
  mutate(year = as.numeric(year)) %>%
  # Sort once, before grouping: arrange() ignores groups unless asked, so the
  # ordering is stated explicitly (latest year first, months ascending).
  arrange(desc(year), my_month) %>%
  group_by(year) %>%
  # change in val from the previous month; NA for the first month of each year
  mutate(pre_month = c(NA, diff(val))) %>%
  # return an ungrouped result so later verbs are not silently applied per year
  ungroup()
I changed year to a numeric so it could be sorted sensibly.

Insert rows corresponding to each month in an yearly data

I have a dataframe with yearly data from 2014 to 2018. I wish to expand this dataframe into monthly values and basically divide each variable's value by 12 as each month's value.
Please note: There is no month column in my dataframe as of now.
So, if there are 5 products, I have 5*5 rows and 5 columns :"year", "Product_ID", "Var1", "Var2" and "Var3" as the columns.
Eventually, I want 5*12 rows and 6 columns with "month" inserted as well.
I have tried this code but it isn't working:
# Attempt from the question, reported there as not working.
# Adds an all-NA `date` column to df_new ...
df_new$date <- NA
# ... then overwrites df_new with the result of complete() applied to `df`.
# NOTE(review): complete() is given `df`, not `df_new`, and full_seq() receives
# the single value 2014 -- presumably neither is what was intended; confirm.
df_new <- complete(df,Product_ID, date = full_seq(2014,1))
Any suggestions?
One option is to use uncount to repeat rows 12 times, create a new column month to take 1:12 value for each year and then divide Var columns by 12.
library(dplyr)
library(tidyr)
# Repeat every yearly row 12 times, number the months, and split the yearly
# values evenly across the months.
df %>%
  uncount(12) %>%
  group_by(year) %>%
  # rep_len() cycles 1..12 over the whole group, so a year containing several
  # products (12 consecutive rows each) no longer fails with a size mismatch
  # as `month = 1:12` did
  mutate(month = rep_len(1:12, n())) %>%
  # across() replaces the superseded mutate_at()
  mutate(across(c(Var1, Var2), ~ .x / 12))
# Groups: year [3]
# year Product_ID Var1 Var2 month
# <int> <chr> <dbl> <dbl> <int>
# 1 2013 A 0.833 5 1
# 2 2013 A 0.833 5 2
# 3 2013 A 0.833 5 3
# 4 2013 A 0.833 5 4
# 5 2013 A 0.833 5 5
# 6 2013 A 0.833 5 6
# 7 2013 A 0.833 5 7
# 8 2013 A 0.833 5 8
# 9 2013 A 0.833 5 9
#10 2013 A 0.833 5 10
# … with 26 more rows
Or another option with complete and fill
# Mark the existing row as month 1, add months 2..12, carry the yearly values
# down into the new rows, and split them evenly across the months.
df %>%
  mutate(month = 1L) %>%
  # nesting() expands the months for every year/product pair present in the
  # data; completing on year alone would not add rows per product when a year
  # holds several products
  complete(nesting(year, Product_ID), month = 1:12) %>%
  fill(Var1, Var2) %>%
  # across() replaces the superseded mutate_at()
  mutate(across(c(Var1, Var2), ~ .x / 12))
data
# Example data: one product per year with two yearly measures.
df <- data.frame(
  year = 2013:2015,
  Product_ID = LETTERS[1:3],
  Var1 = c(10, 20, 30),
  Var2 = c(60, 80, 120),
  stringsAsFactors = FALSE
)

Resources