Binning by Subgroup in R - r

I have a dataframe with Markets, Retailers and Sales. I need to bin the Retailers within each Market into 5 quantiles.
Example:
dataframe <- structure(list(Market = c(1, 1, 1, 2, 2, 2), Retailer = c(1,
2, 3, 4, 5, 6), Sales = c(5, 10, 25, 5, 10, 25), Quantile = c(1,
2, 3, 1, 2, 3)), class = "data.frame", row.names = c(NA, -6L))

One approach is using group_by and ntile from dplyr:
# dplyr provides group_by(), mutate() and ntile().
library(dplyr)
# Bin retailers separately inside each market.
dataframe %>%
group_by(Market) %>%
# ntile(Sales, 4) ranks Sales into 4 roughly equal-sized buckets per Market.
# The question asks for 5 bins; use ntile(Sales, 5) for that.
mutate(Quantile = ntile(Sales, 4))
# A tibble: 150 x 4
# Groups: Market [3]
Market Retailer Sales Quantile
<int> <int> <dbl> <int>
1 1 1 16804 1
2 1 2 80752 4
3 1 3 38494 2
4 1 4 32773 2
5 1 5 60210 3
# … with 145 more rows
Data
# Reproducible sample data: 3 markets x 50 retailers with random sales.
set.seed(3)
dataframe <- data.frame(
  Market = rep(1:3, each = 50),
  Retailer = rep(1:50, times = 3),
  # uniform draws on [0, 100000], rounded to whole units
  Sales = round(runif(150, min = 0, max = 100000), digits = 0)
)

Related

manipulate a pair data in R

I would like to reshape the data sample below to get output like the table shown. How can I achieve that? The idea is to split the column e into two columns according to disease: those with disease 0 in one column and those with disease 1 in the other. Thanks in advance.
structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), fid = c(1,
1, 2, 2, 3, 3, 4, 4, 5, 5), disease = c(0, 1, 0, 1, 1, 0, 1, 0, 0,
1), e = c(3, 2, 6, 1, 2, 5, 2, 3, 1, 1)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
library(tidyverse)
# Spread `e` into one column per disease value (e0, e1), one row per fid.
# `id_cols` is named explicitly: supplying it by position is deprecated
# since tidyr 1.3.0.
df %>%
  pivot_wider(
    id_cols = fid,
    names_from = disease,
    values_from = e,
    names_prefix = "e"
  ) %>%
  # fid was only needed to pair the rows; drop it from the result
  select(-fid)
e0 e1
<dbl> <dbl>
1 3 2
2 6 1
3 5 2
4 3 2
5 1 1
if you want the e1,e2 you could do:
# Same reshape, but name the new columns e1/e2 by adding 1 to the disease code.
# `id_cols` is named explicitly: supplying it by position is deprecated
# since tidyr 1.3.0.
df %>%
  pivot_wider(
    id_cols = fid,
    names_from = disease,
    values_from = e,
    names_glue = "e{disease + 1}"
  ) %>%
  # fid was only needed to pair the rows; drop it from the result
  select(-fid)
# A tibble: 5 x 2
e1 e2
<dbl> <dbl>
1 3 2
2 6 1
3 5 2
4 3 2
5 1 1
We could use lead() combined with ifelse() statements for this:
# dplyr provides mutate(), lead(), filter(), row_number() and select().
library(dplyr)
# NOTE: relies on row order; assumes the two rows of each fid are adjacent,
# as they are in the sample data.
df %>%
# pull the next row's e alongside the current row
mutate(e2 = lead(e)) %>%
# keep only odd rows, i.e. the first row of each pair
filter(row_number() %% 2 == 1) %>%
# put the disease == 0 value in e1 and the disease == 1 value in e2;
# e1 is computed first, so it still sees e2 as the partner row's e
mutate(e1 = ifelse(disease==1, e2,e),
e2 = ifelse(disease==0, e2,e)) %>%
select(e1, e2)
e1 e2
<dbl> <dbl>
1 3 2
2 6 1
3 5 2
4 3 2
5 1 1

Calculating percent change between current value and value of a year ago R

Here is a sample of my dataframe:
# Sample data: 16 weekly rows for retailer 2, 8 for store 5 and 8 for store 6.
# Every column must supply 16 values; `fill` previously had only 10, which
# makes data.frame() stop with "arguments imply differing number of rows".
A <- data.frame(
  retailer = rep(2, 16),
  store = rep(c(5, 6), each = 8),
  week = c(2021100301, 2021092601, 2021091901, 2021091201,
           2021082901, 2021082201, 2021081501, 2021080801,
           2020100101, 2020092501, 2020091801, 2020091101,
           2020090401, 2020082701, 2020082001, 2020081301),
  dollars = c(121817.9, 367566.7, 507674.5, 421257.8,
              453330.3, 607551.4, 462674.8, 464329.1,
              339342.3, 549271.5, 496720.1, 554858.7,
              382675.5, 373210.9, 422534.2, 381668.6),
  final_week = c("20211040", "20211039", "20211038", "20211037",
                 "20210935", "20210934", "20210933", "20210832",
                 "20201040", "20201039", "20201038", "20201037",
                 "20200935", "20200934", "20200933", "20200832"),
  fill = rep(1, 16)
)
I have grouped these by retailer, then by store, and then by descending final_week value within each group. How can I find the percent difference in dollars between the most recent 4 final_week values and their corresponding final_week values from a year ago within each store grouping? For example, the first value in the final_week column is 20211040, which refers to year 2021. The value from a year ago would be 20201040 (year 2020). I would like to find the percent difference between these values, doing this for the four most recent final_week values (the first four per store groups as I have arranged them in descending order).
I've started by doing the following:
# Sort each retailer/store group by final_week, newest first.
# `.by_group = TRUE` (with the leading dot) makes arrange() sort by the
# grouping columns first, so they need not be repeated here.
A <- A %>%
  group_by(retailer, store) %>%
  arrange(desc(final_week), .by_group = TRUE)
but am not sure where to go from here. Thanks!
I've slightly modified your example data to get a useful result (one store, both years); here is a tidyverse solution:
library(dplyr)
library(tidyr)
A %>%
  # split final_week: first four characters are the year, the rest the week
  mutate(
    year_ = substr(final_week, 1, 4),
    week_ = substr(final_week, 5, 8)
  ) %>%
  # remove columns that differ between years, so rows can be paired by week_
  select(-final_week, -week) %>%
  # from long to wide, with one dollars column per year
  pivot_wider(names_from = year_, values_from = dollars) %>%
  # keep the 4 rows with the largest week_ values
  top_n(wt = week_, 4) %>%
  # percent change from 2020 to 2021
  mutate(perc = (`2021` - `2020`) / `2020` * 100)
# A tibble: 4 x 7
retailer store fill week_ `2021` `2020` perc
<dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 2 5 1 1040 121818. 339342. -64.1
2 2 5 1 1039 367567. 549272. -33.1
3 2 5 1 1038 507674. 496720. 2.21
4 2 5 1 1037 421258. 554859. -24.1
EDIT:
Here is a solution for years that can vary:
# Same computation without hard-coding the years.
A %>%
# label rows of the smallest year 'first_y' and all other rows 'second_y'
mutate(year_ = ifelse(substr(final_week,1,4) == min(substr(final_week,1,4)), 'first_y', 'second_y'),
week_ = substr(final_week,5,8)) %>%
# remove columns that differ between years, so rows can be paired by week_
select(-final_week,- week) %>%
# from long to wide: one dollars column per year label
pivot_wider(names_from = year_,
values_from = c(dollars)) %>%
# keep the 4 rows with the largest week_ values
top_n(wt = week_,4) %>%
# percent change from the first year to the second
mutate(perc = (second_y - first_y)/ first_y*100)
# A tibble: 4 x 7
retailer store fill week_ second_y first_y perc
<dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 2 5 1 1040 121818. 339342. -64.1
2 2 5 1 1039 367567. 549272. -33.1
3 2 5 1 1038 507674. 496720. 2.21
4 2 5 1 1037 421258. 554859. -24.1
With data
# Data used by the answer above: 16 rows, all for store 5 (both years),
# with `fill` given 16 values so every column has the same length.
A <- data.frame(retailer = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2),
store = rep(5,16),
week = c(2021100301, 2021092601, 2021091901, 2021091201, 2021082901, 2021082201, 2021081501, 2021080801,
2020100101, 2020092501, 2020091801, 2020091101, 2020090401, 2020082701, 2020082001, 2020081301),
dollars = c(121817.9, 367566.7, 507674.5, 421257.8, 453330.3, 607551.4, 462674.8,
464329.1, 339342.3, 549271.5, 496720.1, 554858.7, 382675.5,
373210.9, 422534.2, 381668.6),
final_week = c("20211040", "20211039", "20211038", "20211037", "20210935", "20210934", "20210933", "20210832",
"20201040", "20201039", "20201038", "20201037", "20200935", "20200934", "20200933", "20200832"),
fill = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1,1,1,1,1,1))

How to count the number of observations for a specific condition in R?

I have a dataset like this:
# Example scores: 5 yearly observations for ID 1 and 4 for ID 2.
# The closing parenthesis of data.frame() was missing.
data <- data.frame(
  ID = c(1, 1, 1, 1, 1, 2, 2, 2, 2),
  year = c(1, 2, 3, 4, 5, 1, 2, 3, 4),
  score = c(0.89943475, -3.51761975, 1.54511640, -1.38284380, 2.45591240,
            -1.89925250, 0.83935451, -0.61843636, -0.70421765)
)
ID, year, score
1, 1, 0.89943475
1, 2, -3.51761975
1, 3, 1.54511640
1, 4, -1.38284380
1, 5, 2.45591240
2, 1, -1.89925250
2, 2, 0.83935451
2, 3, -0.61843636
2, 4, -0.70421765
I want to create a data table which aggregates the above data and counts the number of observations for an ID when score is positive and negative, like this:
ID, pos, neg, total
1, 3, 2, 5
2, 1, 3, 4
Is this possible to do using data.table in R?
An alternative to akrun's answer:
# Per ID: count non-negative scores (pos), negative scores (neg) and all rows
# (.N). A score of exactly 0 is counted as pos. `data` must be a data.table.
data[, .(pos = sum(score >= 0), neg = sum(score < 0), total = .N), by = ID]
# ID pos neg total
# <num> <int> <int> <int>
# 1: 1 3 2 5
# 2: 2 1 3 4
Data
# data.table must be loaded before a data.table can be built or queried.
library(data.table)
# Example scores as a data.table: 5 rows for ID 1 and 4 rows for ID 2.
data <- data.table(
  ID = c(1, 1, 1, 1, 1, 2, 2, 2, 2),
  year = c(1, 2, 3, 4, 5, 1, 2, 3, 4),
  score = c(0.89943475, -3.51761975, 1.5451164, -1.3828438, 2.4559124,
            -1.8992525, 0.83935451, -0.61843636, -0.70421765)
)
We could use dcast with sign
library(data.table)
# Cast to wide form: one row per ID, one column per sign(score) value
# (-1 for negative, 1 for positive; a score of 0 would add a `0` column),
# each cell holding the number of matching rows.
# Then add `total` as the row sum of every column except the first (ID);
# the trailing [] prints the result after the by-reference := assignment.
dcast(setDT(data), ID ~ sign(score), fun.aggregate = length)[,
total := rowSums(.SD), .SDcols = -1][]
-output
ID -1 1 total
1: 1 2 3 5
2: 2 3 1 4

Calculate mean based on first part of row.name() in R

I have a data frame that looks like this:
structure(list(value1 = c(1, 2, 3, 4, 5), value2 = c(1, 2, 2,
2, 2), value3 = c(1, 1, 2, 3, 4)), class = "data.frame", row.names = c("apple1",
"apple2", "orange1", "orange2", "plum"))
        value1 value2 value3
apple1       1      1      1
apple2       2      2      1
orange1      3      2      2
orange2      4      2      3
plum         5      2      4
now I want to run the mean function on every column based on the first part of the row names
(for example I want to calculate the mean of value1 of the apple group independently from their apple number.)
I figured out that something like this works:
# Column means per fruit, selecting rows whose row name contains the fruit.
# Each group reports value1, value2 and value3 (the original repeated value2
# and misspelled value3, so value3 was never computed).
y <- x[grep("apple", row.names(x)), ]
mean(y$value1)
mean(y$value2)
mean(y$value3)
y <- x[grep("orange", row.names(x)), ]
mean(y$value1)
mean(y$value2)
mean(y$value3)
y <- x[grep("plum", row.names(x)), ]
mean(y$value1)
mean(y$value2)
mean(y$value3)
but for a bigger dataset, this is going to take ages, so I was wondering if there is a more efficient way to subset the data based on the first part of the row name and calculating the mean afterward.
Using tidyverse:
library(tidyverse)
df %>%
  # move the row names into a regular column so they can be grouped on
  tibble::rownames_to_column("row") %>%
  # drop the digits so "apple1" and "apple2" share the group "apple"
  dplyr::mutate(row = str_remove(row, "\\d+")) %>%
  dplyr::group_by(row) %>%
  # `.groups` is an argument of summarize(), not of across()
  dplyr::summarize(across(where(is.numeric), ~ mean(.)), .groups = "drop")
In base R you could do:
# Add a grouping column: the row names with all digits removed.
df$row <- gsub("\\d+", "", rownames(df))
# For each of the first three columns, take the mean within each `row` group
# via by(), then bind the per-column results side by side into a data frame.
data.frame(do.call(cbind, lapply(df[,1:3], function(x) by(x, df$row, mean))))
Output
row value1 value2 value3
* <chr> <dbl> <dbl> <dbl>
1 apple 1.5 1.5 1
2 orange 3.5 2 2.5
3 plum 5 2 4
Data
df <- structure(list(value1 = 1:5, value2 = c(1, 2, 2, 2, 2), value3 = c(1,
1, 2, 3, 4)), class = "data.frame", row.names = c("apple1", "apple2",
"orange1", "orange2", "plum"))

Using group_by() to compute "grouped" ICC values

I'm trying to compute ICC values for each subject for the table below, but group_by() is not working as I think it should.
SubID Rate1 Rate2
1 1 2 5
2 1 2 4
3 1 2 5
4 2 3 4
5 2 4 1
6 2 5 1
7 2 2 2
8 3 2 5
9 3 3 5
The code I am running is as follows:
df %>%
group_by(SubID) %>%
summarise(icc = DescTools::ICC(.)$results[3, 2])
and the output:
# A tibble: 3 x 2
SubID icc
<dbl> <dbl>
1 1 -0.247
2 2 -0.247
3 3 -0.247
It seems that summarise is not being applied according to groups, but to the entire dataset. I'm not sure what is going on.
dput()
structure(list(SubID = c(1, 1, 1, 2, 2, 2, 2, 3, 3), Rate1 = c(2,
2, 2, 3, 4, 5, 2, 2, 3), Rate2 = c(5, 4, 5, 4, 1, 1, 2, 5, 5)), class = "data.frame", row.names = c(NA,
-9L))
Not terribly familiar with library(DescTools) but here is a potential solution that utilizes a nest() / map() combo:
# DescTools provides ICC(); tidyverse provides nest(), mutate(), map_dbl(),
# filter() and pull().
library(DescTools)
library(tidyverse)
# Example ratings: two raters (Rate1, Rate2) scored per subject (SubID).
df <- structure(
list(SubID = c(1, 1, 1, 2, 2, 2, 2, 3, 3),
Rate1 = c(2, 2, 2, 3, 4, 5, 2, 2, 3),
Rate2 = c(5, 4, 5, 4, 1, 1, 2, 5, 5)),
class = "data.frame", row.names = c(NA, -9L)
)
df %>%
# collapse every column except SubID into one list-column of per-subject
# data frames, so ICC() sees only that subject's Rate1/Rate2 rows
nest(ICC3 = -SubID) %>%
# run ICC() on each nested data frame and keep the `est` value of the
# row whose `type` is "ICC3"
mutate(ICC3 = map_dbl(ICC3, ~ ICC(.x)[["results"]] %>%
filter(type == "ICC3") %>%
pull(est)))
#> # A tibble: 3 x 2
#> SubID ICC3
#> <dbl> <dbl>
#> 1 1 2.83e-15
#> 2 2 -5.45e- 1
#> 3 3 -6.66e-16
Created on 2021-03-08 by the reprex package (v0.3.0)

Resources