R groupby and divide column by a value in column - r

I have a dataframe like so:
id year month val
1 2020 1 50
1 2020 7 80
1 2021 1 40
1 2021 7 70
.
.
Now, I want to index all the values using Jan 2020 as index year for each id. Essentially group by id, then divide val with val at Jan 2020 * 100. So the final dataframe would look something like this:
id year month val
1 2020 1 100
1 2020 7 160
1 2021 1 80
1 2021 7 140
.
.
This is what I tried till now:
df %>% group_by(id) %>% mutate(val = 100*val/[val at Jan 2020])
I can separately get val at Jan 2020 like so:
df %>% filter(year==2020, month==1) %>% select(val)
But it doesn't work together:
df %>% group_by(id) %>% mutate(val = 100*val/(df %>% filter(year==2020, month==1) %>% select(val)))
The above throws an error.

A dplyr approach
library(dplyr)
# Rebase `val` within each id so that the id's January 2020 observation
# becomes 100 and every other observation is expressed relative to it.
df %>%
  group_by(id) %>%
  mutate(
    val = val / val[month == 1 & year == 2020] * 100
  ) %>%
  ungroup()
# A tibble: 4 × 4
id year month val
<int> <int> <int> <dbl>
1 1 2020 1 100
2 1 2020 7 160
3 1 2021 1 80
4 1 2021 7 140

Base R
# Split the data by id, rebase each piece on its own January 2020 value,
# then stack the pieces back into a single data frame.
do.call(
  rbind,
  lapply(
    split(df, df$id),
    function(piece) {
      is_base <- piece$year == 2020 & piece$month == 1
      rebased <- piece$val / piece$val[is_base] * 100
      # Drop the old column and append the rebased one so `val` stays last.
      cbind(piece[names(piece) != "val"], "val" = rebased)
    }
  )
)
id year month val
1.1 1 2020 1 100
1.2 1 2020 7 160
1.3 1 2021 1 80
1.4 1 2021 7 140

Related

horizontal and vertical join count in r dataframe

I have a dataframe with sales per customer and month.
# Example sales data: one row per purchase, giving the month, the customer
# and the product bought.
# (The original snippet never closed the data.frame() call.)
df <-
  data.frame(
    stringsAsFactors = FALSE,
    date = c("jan", "jan", "jan", "jan",
             "jan", "jan", "jan", "feb", "feb", "feb", "feb", "feb",
             "feb", "feb"),
    customer = c("john", "john", "john", "Mary",
                 "Mary", "Mary", "Mary", "Robert", "Robert", "Mary",
                 "john", "john", "Robert", "Robert"),
    product = c("a", "b", "d", "a", "b", "c",
                "d", "a", "b", "c", "a", "c", "c", "d")
  )
date customer product
1 jan john a
2 jan john b
3 jan john d
4 jan Mary a
5 jan Mary b
6 jan Mary c
7 jan Mary d
8 feb Robert a
9 feb Robert b
10 feb Mary c
11 feb john a
12 feb john c
13 feb Robert c
14 feb Robert d
I need to summarize how many times the same customer is present across months and products.
Expected result:
date a b c d same cust
jan 2 2 1 2 0
feb 2 1 2 0 1
same cust 1 0 1 0
A possible solution:
library(tidyverse)
# Example sales data: one row per purchase (month, customer, product).
df <-
  data.frame(
    stringsAsFactors = FALSE,
    date = c("jan", "jan", "jan", "jan",
             "jan", "jan", "jan", "feb", "feb", "feb", "feb", "feb",
             "feb", "feb"),
    customer = c("john", "john", "john", "Mary",
                 "Mary", "Mary", "Mary", "Robert", "Robert", "Mary",
                 "john", "john", "Robert", "Robert"),
    product = c("a", "b", "d", "a", "b", "c",
                "d", "a", "b", "c", "a", "c", "c", "d")
  )

# Main table: number of purchases per month (rows) and product (columns).
# `SCust` column: customers with two or more purchases in that month.
# `SCust` row: customers with two or more purchases of that product.
df %>%
  # `id_cols` is named explicitly; passing it by position is deprecated.
  pivot_wider(
    id_cols = date,
    names_from = product,
    values_from = customer,
    values_fn = length
  ) %>%
  # table() sorts months alphabetically (feb, jan) while the rows here are
  # in order of appearance (jan, feb), so look the counts up by month name
  # instead of binding them by position.
  mutate(
    SCust = unname(
      apply(table(df$customer, df$date), 2, \(x) sum(x >= 2))[date]
    )
  ) %>%
  # bind_rows() matches the per-product counts to the columns by name.
  bind_rows(c(
    tibble(date = "SCust"),
    table(df$customer, df$product) %>% apply(2, \(x) sum(x >= 2))
  ))
#> # A tibble: 3 × 6
#> date a b d c SCust
#> <chr> <int> <int> <int> <int> <int>
#> 1 jan 2 2 2 1 2
#> 2 feb 2 1 1 3 2
#> 3 SCust 1 0 0 1 NA
I don't know about the marginals, but for the main table
library(reshape2)
# Number of distinct customers per month (rows) and product (columns).
dcast(
  data = df,
  formula = date ~ product,
  fun.aggregate = function(customers) {
    length(unique(customers))
  },
  value.var = "customer"
)
date a b c d
1 feb 2 1 3 1
2 jan 2 2 1 2
You can try
library(tidyverse)
# Distinct customers per month and product, followed by an "all" row that
# holds, per product, the number of purchases beyond each customer's first.
df %>%
  pivot_wider(
    names_from = product,
    values_from = customer,
    values_fn = n_distinct
  ) %>%
  bind_rows(
    df %>%
      count(product, customer) %>%
      group_by(product) %>%
      summarise(
        n = sum(n - 1),
        date = "all"
      ) %>%
      pivot_wider(names_from = product, values_from = n)
  )
# A tibble: 3 x 5
date a b d c
<chr> <dbl> <dbl> <dbl> <dbl>
1 jan 2 2 2 1
2 feb 2 1 1 3
3 all 1 0 0 1
# Example sales data: one row per purchase (month, customer, product).
dt <- data.frame(
  stringsAsFactors = FALSE,
  date = rep(c("jan", "feb"), each = 7),
  customer = c(
    "john", "john", "john", "Mary", "Mary", "Mary", "Mary",
    "Robert", "Robert", "Mary", "john", "john", "Robert", "Robert"
  ),
  product = c(
    "a", "b", "d", "a", "b", "c", "d",
    "a", "b", "c", "a", "c", "c", "d"
  )
)
library(data.table)
setDT(dt)
setorder(dt, product)
# First element: purchases per month and product, cast wide.
# Second element: per product, purchases minus distinct customers,
# transposed into a single "same_cust_row" row.
rbindlist(list(
  dcast(dt[, .(value = .N), by = .(date, product)], date ~ product),
  transpose(
    dt[, .(same_cust_row = .N - uniqueN(customer)), by = .(product)],
    make.names = "product",
    keep.names = "date"
  )
))
# date a b c d
# 1: feb 2 1 3 1
# 2: jan 2 2 1 2
# 3: same_cust_row 1 0 1 0
Do you need the "detail" data, or just the summary ("same cust") data?
library(dplyr)
library(tidyr)
library(purrr)
# by month / same customer bought in both months
# One column per product holding, per customer, the number of months in
# which that customer bought it; then count the customers with exactly 2.
df %>%
  pivot_wider(
    names_from = product,
    values_from = date,
    values_fn = length
  ) %>%
  select(-customer) %>%
  map(\(months_bought) sum(months_bought == 2))
$a
[1] 1
$b
[1] 0
$d
[1] 0
$c
[1] 1
# by month / same customer bought all (4) products
# One column per month holding, per customer, the number of products
# bought; then count the customers with exactly 4.
z <- df %>%
  pivot_wider(
    names_from = date,
    values_from = product,
    values_fn = length
  ) %>%
  select(-customer) %>%
  map(\(products_bought) sum(products_bought == 4))
$jan
[1] NA
$feb
[1] 1

How do I create a new factor level that summarizes total values of other factor levels?

I have a dataset where I have at least three columns
year sex value
1 2019 M 10
2 2019 F 20
3 2020 M 50
4 2020 F 20
I would like to group by the first column, year, and then add another level to sex that corresponds the total value in column 3, that is, I would like something like this:
year sex value
<int> <chr> <dbl>
1 2019 M 10
2 2019 F 20
3 2019 Total 30
4 2020 M 50
5 2020 F 20
6 2020 Total 70
Any help is appreciated, especially in dplyr.
Here is just another way of doing this:
library(dplyr)
library(purrr)
# Split by year, append a "Total" row to each piece, and row-bind the
# pieces back together.
df %>%
  group_split(year) %>%
  map_dfr(
    \(yearly) add_row(
      yearly,
      year = first(yearly$year),
      sex = "Total",
      value = sum(yearly$value)
    )
  )
# A tibble: 6 x 3
year sex value
<int> <chr> <dbl>
1 2019 M 10
2 2019 F 20
3 2019 Total 30
4 2020 M 50
5 2020 F 20
6 2020 Total 70
You can summarise the data for each year and bind it to the original dataset.
library(dplyr)
# Build one "Total" row per year, append the original rows, and sort so
# each year's total follows its F and M rows.
df %>%
  group_by(year) %>%
  summarise(
    sex = "Total",
    value = sum(value)
  ) %>%
  bind_rows(df) %>%
  arrange(year, sex)
# year sex value
# <int> <chr> <dbl>
#1 2019 F 20
#2 2019 M 10
#3 2019 Total 30
#4 2020 F 20
#5 2020 M 50
#6 2020 Total 70
Or in base R -
# Sum `value` per year, label those rows "Total", and stack the original
# rows underneath (rbind matches the columns by name).
rbind(
  transform(
    aggregate(value ~ year, df, sum),
    sex = "Total"
  ),
  df
)
data
# Example data: one M and one F row for each of 2019 and 2020.
df <- data.frame(
  year = rep(2019:2020, each = 2),
  sex = rep(c("M", "F"), times = 2),
  value = c(10, 20, 50, 20)
)

How best to transform quarterly data into monthly

Below is the sample data. I receive the data in a form such as this: each row is a quarter, and the months are columns within it. I am trying to do some month-over-month calculations and think I need to transform the data frame in order to do so. I was thinking of using pivot_longer, but I am not finding anything online in a similar vein. Below is the desired result.
# Quarterly example data: three years of four quarters, with one
# employment column per month of the quarter.
year <- rep(c(2018, 2019, 2020), each = 4)
qtr <- rep(c(1, 2, 3, 4), times = 3)
avgemp <- seq(3, 25, by = 2)
month1emp <- seq(2, 24, by = 2)
month2emp <- seq(3, 25, by = 2)
month3emp <- seq(4, 26, by = 2)
# `avgemp` is defined above but is not one of the columns.
sample <- data.frame(year, qtr, month1emp, month2emp, month3emp)
Desired Result
year qtr month employment
2018 1 1 2
2018 1 2 3
2018 1 3 4
2018 2 4 4
2018 2 4 5
2018 2 4 6
and so on. At 2019, the month value would restart and go from 1 to 12.
We could use pivot_longer on the 'month' columns, specify the names_pattern to capture the digits ((\\d+)) followed by the emp for the 'month' and the .value columns
library(dplyr)
library(tidyr)
# Stack the month columns: the digit captured from each column name becomes
# `month`, and the values land in `emp`, renamed to `employment`.
sample %>%
  pivot_longer(
    cols = starts_with("month"),
    names_to = c("month", ".value"),
    names_pattern = ".*(\\d+)(emp)"
  ) %>%
  rename(employment = emp)
-output
# A tibble: 36 x 4
year qtr month employment
<dbl> <dbl> <chr> <dbl>
1 2018 1 1 2
2 2018 1 2 3
3 2018 1 3 4
4 2018 2 1 4
5 2018 2 2 5
6 2018 2 3 6
7 2018 3 1 6
8 2018 3 2 7
9 2018 3 3 8
10 2018 4 1 8
# … with 26 more rows
If we need to increment the 'month' based on 'qtr' value
# Same reshape as before, then shift the within-quarter month (1-3) by the
# quarter's offset (0, 3, 6 or 9) to get the calendar month.
sample %>%
  pivot_longer(
    cols = starts_with("month"),
    names_to = c("month", ".value"),
    names_pattern = ".*(\\d+)(emp)"
  ) %>%
  rename(employment = emp) %>%
  mutate(
    month = as.integer(month) + c(0, 3, 6, 9)[qtr]
  )
# A tibble: 36 x 4
year qtr month employment
<dbl> <dbl> <dbl> <dbl>
1 2018 1 1 2
2 2018 1 2 3
3 2018 1 3 4
4 2018 2 4 4
5 2018 2 5 5
6 2018 2 6 6
7 2018 3 7 6
8 2018 3 8 7
9 2018 3 9 8
10 2018 4 10 8
# … with 26 more rows
Base R solution:
# Base R reshape of the question's quarterly data frame, `sample`
# (the original snippet referred to an undefined-in-this-thread `df`).

# Create a vector of boolean values,
# denoting whether or not the columns should
# be unpivoted: unpivot_cols => boolean vector
unpivot_cols <- startsWith(
  names(sample),
  "month"
)

# Reshape the data.frame, calculate
# the month value: rshpd_df => data.frame
rshpd_df <- transform(
  reshape(
    sample,
    direction = "long",
    varying = names(sample)[unpivot_cols],
    ids = NULL,
    timevar = "month",
    times = seq_len(sum(unpivot_cols)),
    v.names = "employment",
    new.row.names = seq_len(
      nrow(sample) * ncol(sample)
    )
  ),
  # Turn the within-quarter month (1-3) into the calendar month by adding
  # three months for every completed quarter.
  month = ((12 / 4) * (qtr - 1)) + month
)

# Order the data.frame by year and month:
# ordered_df => data.frame
ordered_df <- with(
  rshpd_df,
  rshpd_df[order(year, month), ]
)

Using dplyr and group_by to calculate number of repetition for a value

I have a dataset which includes seller_ID, product_ID and year the product was sold, and I am trying to find the year that one seller had maximum sold product and the specific number of sold in that year for each individual seller. Here is an example of data
# Example sales records: one row per product sold by a seller in a year.
seller_ID <- rep(c(1, 2, 3, 4), times = c(3, 2, 1, 5))
Product_ID <- c(
  1000, 1000, 1005, 1004, 1005, 1003,
  1010, 1000, 1001, 1019, 1017
)
year <- c(
  2015, 2016, 2015, 2020, 2020, 2000,
  2000, 2001, 2001, 2001, 2005
)
data <- data.frame(seller_ID, Product_ID, year)
seller_ID Product_ID year
1 1 1000 2015
2 1 1000 2016
3 1 1005 2015
4 2 1004 2020
5 2 1005 2020
6 3 1003 2000
7 4 1010 2000
8 4 1000 2001
9 4 1001 2001
10 4 1019 2001
11 4 1017 2005
so the ideal result would be:
seller_ID Max_sold_num_year Max_year
1 1 2 2015
2 2 2 2020
3 3 1 2000
4 4 3 2001
I tried the approach I explained below and it worked ...
# Asker's original approach: find, for every seller, the year with the most
# products sold and how many were sold in that year.

# Count the products each seller sold in each year. The resulting columns
# are, in order: seller_ID, year, Sold_in_Year.
df_temp<- data %>%
group_by(seller_ID, year) %>%
summarize(Sold_in_Year=length(Product_ID))
unique_seller=unique(data$seller_ID)
# Accumulators for the per-seller results; each is grown by one element per
# loop iteration, with `j` as the next free position.
ID_list=c()
Max_list=c()
Max_Sold_Year=c()
j=1
for (ID in unique_seller) {
# Yearly counts belonging to this seller.
df_temp_2<- subset(df_temp, df_temp$seller_ID==ID)
# Row(s) where this seller's yearly count is highest; a tie gives several.
Max_year<- subset(df_temp_2,df_temp_2$Sold_in_Year==max(df_temp_2$Sold_in_Year))
# NOTE(review): both branches below are identical, so the first row of
# Max_year is kept whether or not there is a tie.
if (nrow(Max_year)>1){
ID_list[j]<-Max_year[1,1]
Max_Sold_Year[j]<-Max_year[1,2]
Max_list[j]<-Max_year[1,3]
j<-j+1
}
else {
ID_list[j]<-Max_year[1,1]
Max_Sold_Year[j]<-Max_year[1,2]
Max_list[j]<-Max_year[1,3]
j<-j+1
}
}
# Copy the three accumulators, element by element, into a pre-sized
# data frame with one row per seller.
mm=length(ID_list)
df_test_list<- data.frame(seller_ID=numeric(mm), Max_sold_num_year=numeric(mm),Max_year=numeric(mm))
# NOTE(review): 1:mm would iterate over c(1, 0) if mm were 0.
for (i in 1:mm){
df_test_list$seller_ID[i] <- ID_list[[i]]
df_test_list$Max_sold_num_year[i] <- Max_list[[i]]
df_test_list$Max_year[i] <- Max_Sold_Year[[i]]
}
However, because it subsets the data on every iteration and uses a for loop, this approach is rather slow for a large dataset. Do you have any suggestions on how I can improve my code? Is there another way to calculate the desired result without using a for loop?
Thanks
Try this
library(dplyr)
# Example sales records: one row per product sold by a seller in a year.
seller_ID <- c(1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 4)
Product_ID <- c(
  1000, 1000, 1005, 1004, 1005, 1003,
  1010, 1000, 1001, 1019, 1017
)
year <- c(
  2015, 2016, 2015, 2020, 2020, 2000,
  2000, 2001, 2001, 2001, 2005
)
data <- data.frame(seller_ID, Product_ID, year)

# Count sales per seller and year, then keep each seller's row(s) with the
# highest count. The result stays grouped by seller_ID.
data %>%
  dplyr::count(seller_ID, year) %>%
  dplyr::rename(Max_sold_num_year = n, Max_year = year) %>%
  dplyr::group_by(seller_ID) %>%
  dplyr::filter(Max_sold_num_year == max(Max_sold_num_year))
#> # A tibble: 4 x 3
#> # Groups: seller_ID [4]
#> seller_ID Max_year Max_sold_num_year
#> <dbl> <dbl> <int>
#> 1 1 2015 2
#> 2 2 2020 2
#> 3 3 2000 1
#> 4 4 2001 3
Thanks to a comment by @yung_febreze, this can be written even more concisely:
# Count sales per seller and year, then keep each seller's row(s) with the
# highest count. The weight column is named explicitly so the call does not
# depend on `n` happening to be the last column.
data %>%
  dplyr::count(seller_ID, year) %>%
  dplyr::group_by(seller_ID) %>%
  dplyr::top_n(1, wt = n)
EDIT In case of duplicated maximum values one can add dplyr::top_n(1, wt = year) which filters for the latest (or maximum) year:
# Keep each seller's highest yearly count and, when several years tie,
# only the latest of those years.
data %>%
  dplyr::count(seller_ID, year) %>%
  dplyr::group_by(seller_ID) %>%
  dplyr::top_n(1, wt = n) %>%
  dplyr::top_n(1, wt = year) %>%
  dplyr::rename(
    Max_sold_num_year = n,
    Max_year = year
  )

Dplyr solution using slice and group

Ciao, Here is my replicating example.
# Wide example data: one row per id, with four (drop, year) column pairs.
a <- c(1, 2, 3, 4, 5, 6)
a1 <- c(15, 17, 17, 16, 14, 15)
a2 <- c(0, 0, 1, 1, 1, 0)
b <- c(1, 0, NA, NA, 0, NA)
c <- rep(2010, 6)
d <- c(1, 1, 0, 1, 0, NA)
e <- rep(2012, 6)
f <- c(1, 0, 0, 0, 0, NA)
g <- rep(2014, 6)
h <- c(1, 1, 0, 1, 0, NA)
i <- c(2010, 2012, 2014, 2012, 2014, 2014)
mydata <- data.frame(a, a1, a2, b, c, d, e, f, g, h, i)
names(mydata) <- c(
  "id", "age", "gender",
  "drop1", "year1", "drop2", "year2",
  "drop3", "year3", "drop4", "year4"
)

# Long version: one row per id and wave. `Year` is the wave index (1-4),
# while `year` and `drop` hold that wave's values.
mydata2 <- reshape(
  mydata,
  direction = "long",
  varying = list(
    c("year1", "year2", "year3", "year4"),
    c("drop1", "drop2", "drop3", "drop4")
  ),
  v.names = c("year", "drop"),
  idvar = "X",
  timevar = "Year",
  times = 1:4
)
# First row per id where drop is 1.
x1 <- mydata2 %>%
  group_by(id) %>%
  slice(which(drop == 1)[1])

# First row per id where drop is 0.
x2 <- mydata2 %>%
  group_by(id) %>%
  slice(which(drop == 0)[1])
I have data "mydata2" which is tall such that every ID has many rows.
I want to make new data set "x" such that every ID has one row that is based on if they drop or not.
The first of drop1 drop2 drop3 drop4 that equals to 1, I want to take the year of that and put that in a variable dropYEAR. If none of drop1 drop2 drop3 drop4 equals to 1 I want to put the last data point in year1 year2 year3 year4 in the variable dropYEAR.
Ultimately every ID should have 1 row and I want to create 2 new columns: didDROP equals to 1 if the ID ever dropped or 0 if the ID did not ever drop. dropYEAR equals to the year of drop if didDROP equals to 1 or equals to the last reported year1 year2 year3 year4 if the ID did not ever drop. I try to do this in dplyr but this gives part of what I want only because it gets rid of ID values that equals to 0.
This is the desired output, thank you to @Wimpel
First run mydata2 %>% arrange(id) to understand the dataset. Then, using dplyr's first and last, we can pull the first year where drop==1 or, if drop is never 1, the last year where drop is not NA. case_when is used to compute didDROP because it handles NAs gracefully.
library(dplyr)
# Per id: dropY is the first year with drop == 1 (NA if there is none);
# dropYEAR falls back to the last year with a non-missing drop.
# Only the first row of each id is kept.
mydata2 %>%
  group_by(id) %>%
  mutate(
    dropY = first(year[!is.na(drop) & drop == 1]),
    dropYEAR = if_else(
      is.na(dropY),
      last(year[!is.na(drop)]),
      dropY
    )
  ) %>%
  slice(1)
#Update
# Per id: dropYEAR is the first year with drop == 1, otherwise the last
# year; didDROP flags whether any drop == 1 was seen.
# Only the first row of each id is kept.
mydata2 %>%
  group_by(id) %>%
  mutate(
    dropY = first(year[!is.na(drop) & drop == 1]),
    dropYEAR = if_else(is.na(dropY), last(year), dropY),
    # 1 if there is any drop == 1, otherwise 0
    didDROP = case_when(
      any(drop == 1) ~ 1,
      TRUE ~ 0
    )
  ) %>%
  select(-dropY) %>%
  slice(1)
# A tibble: 6 x 9
# Groups: id [6]
id age gender Year year drop X dropYEAR didDROP
<dbl> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl>
1 1 15 0 1 2010 1 1 2010 1
2 2 17 0 1 2010 0 2 2012 1
3 3 17 1 1 2010 NA 3 2014 0
4 4 16 1 1 2010 NA 4 2012 1
5 5 14 1 1 2010 0 5 2014 0
6 6 15 0 1 2010 NA 6 2014 0
I hope this is what you're looking for.
You can sort by id, drop and year, conditionally on dropping or not:
library(dplyr)
# Treat a missing drop as 0, then sort each id so that drop rows come first:
# among drop rows the earliest year leads, among non-drop rows the latest.
# The first row per id then carries the wanted flag and year.
mydata2 %>%
  mutate(drop = ifelse(is.na(drop), 0, drop)) %>%
  arrange(id, desc(drop), year * (2 * drop - 1)) %>%
  group_by(id) %>%
  slice(1) %>%
  select(
    id,
    age,
    gender,
    didDROP = drop,
    dropYEAR = year
  )
# A tibble: 6 x 5
# Groups: id [6]
id age gender didDROP dropYEAR
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 15 0 1 2010
2 2 17 0 1 2012
3 3 17 1 0 2014
4 4 16 1 1 2012
5 5 14 1 0 2014
6 6 15 0 0 2014

Resources