Subsetting observations by grouping on some features - R

I have a dataset like below:
date time product shop_id
20140104 900 Banana 18
20140104 900 Banana 19
20140104 924 Banana 18
20140104 929 Banana 18
20140104 932 Banana 20
20140104 948 Banana 18
I need to extract the observations with distinct product and shop_id combinations, so I need to group the observations by product + shop_id.
Here is my code:
library(plyr)
d_ply(shop, .(product, shop_id), table)
print(p)
Unfortunately, it prints NULL.
Dataset:
date <- c(20140104, 20140104, 20140104, 20140104, 20140104)
time <- c(924, 900, 854, 700, 1450)
product <- c("Banana", "Banana", "Banana", "Banana", "Banana")
shop_id <- c(18, 18, 18, 19, 20)
shop <- data.frame(date = date, time = time, product = product, shop_id = shop_id)
The output should be:
date time product shop_id
20140104 900 Banana 19
20140104 932 Banana 20
20140104 948 Banana 18

We can do
library(tidyverse)
shop %>%
  group_by(product, shop_id) %>%
  mutate(n = n()) %>%
  group_by(time) %>%
  arrange(n) %>%
  slice(1) %>%
  group_by(product, shop_id) %>%
  arrange(-time) %>%
  slice(1) %>%
  select(-n) %>%
  arrange(time)
# date time product shop_id
# <int> <int> <chr> <int>
#1 20140104 900 Banana 19
#2 20140104 932 Banana 20
#3 20140104 948 Banana 18
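If the goal is simply the latest time per product/shop_id combination, a shorter route (a sketch, not part of the original answer, assuming dplyr >= 1.0 for slice_max) is:
library(dplyr)
shop %>%
  group_by(product, shop_id) %>%  # one group per product/shop combination
  slice_max(time, n = 1) %>%      # keep the row with the latest time
  arrange(time)
Run against the six rows shown at the top of the question, this reproduces the expected output.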

In order to take only the first unique combination, just use aggregate from the stats package:
> aggregate(shop, by=list(shop$product, shop$shop_id), FUN=function(x){x[1]})
Group.1 Group.2 date time product shop_id
1 Banana 18 20140104 924 Banana 18
2 Banana 19 20140104 700 Banana 19
3 Banana 20 20140104 1450 Banana 20
Explanation: my FUN=function(x){x[1]} takes only the first element in case of a collision.
To drop "Group.1", "Group.2" or other columns:
> res <- aggregate(shop, by=list(shop$product, shop$shop_id), FUN=function(x){x[1]})
> res[ , !(names(res) %in% c("Group.1", "Group.2"))]
date time product shop_id
1 20140104 924 Banana 18
2 20140104 700 Banana 19
3 20140104 1450 Banana 20
P.S. The dataset you provided is inconsistent with the example output you requested, which is why the numbers differ.
P.S. 2: If you want to keep all the data in case of a collision:
> aggregate(shop, by=list(shop$product, shop$shop_id), FUN="identity")
Group.1 Group.2 date time product shop_id
1 Banana 18 20140104, 20140104, 20140104 924, 900, 854 1, 1, 1 18, 18, 18
2 Banana 19 20140104 700 1 19
3 Banana 20 20140104 1450 1 20
If you want to mark collisions:
> aggregate(shop, by=list(shop$product, shop$shop_id), FUN=function(x){if (length(x) > 1) NA else x})
Group.1 Group.2 date time product shop_id
1 Banana 18 NA NA NA NA
2 Banana 19 20140104 700 1 19
3 Banana 20 20140104 1450 1 20
If you want to exclude non-unique rows:
> res <- aggregate(shop, by=list(shop$product, shop$shop_id), FUN=function(x){if (length(x) > 1) NULL else x})
> res[res$product != "NULL", !(names(res) %in% c("Group.1", "Group.2"))]
date time product shop_id
2 20140104 700 1 19
3 20140104 1450 1 20
If you want to avoid the coercion from string to integer (for product), use ""/"NULL"/"NA" instead of NULL/NA.
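For example, a minimal sketch of that string-based variant (assuming the same shop data frame as above):
res <- aggregate(shop, by = list(shop$product, shop$shop_id),
                 FUN = function(x) if (length(x) > 1) "NA" else x)
# rows holding the string "NA" are the collisions; drop them and the group columns
res[res$product != "NA", !(names(res) %in% c("Group.1", "Group.2"))]
Because "NA" is an ordinary string, the affected columns simply become character rather than being coerced through NA/NULL.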

It can be done using dplyr as follows:
# create the sample dataset
date=c(20140104,20140104,20140104,20140104,20140104)
time=c(924 ,900,854,700,1450)
product=c("Banana","Banana","Banana","Banana","Banana")
shop_id=c(18,18,18,19,20)
shop<-data.frame(date=date,time=time,product=product,shop_id=shop_id)
# load the dplyr library
library(dplyr)
# take shop data
shop %>%
  # group by product, shop id, date
  group_by(product, shop_id, date) %>%
  # for each such combination, find the earliest time
  summarise(time = min(time)) %>%
  # group by product, shop id
  group_by(product, shop_id) %>%
  # for each combination of product & shop id,
  # return the earliest date and the time recorded on that date
  summarise(date = min(date), time = time[date == min(date)])
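On the five-row sample data this should return one row per product/shop_id pair, with times 854, 700 and 1450 for shops 18, 19 and 20 respectively. Note it keeps the earliest time per combination; swapping min for max in the first summarise would give the latest time instead, which is what the asker's expected output shows.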

Related

Remove duplicates based on multiple conditions

I have some individuals that are listed twice because they have received numerous degrees. I am trying to get only the rows with the latest degree-granting date. Below are examples of the current output and the desired output.
people | g_date | wage|quarter
personA|2009-01-01|100 |20201
personA|2009-01-01|100 |20202
personA|2010-01-01|100 |20201
personA|2010-01-01|100 |20202
personB|2012-01-01|50 |20201
personB|2012-01-01|50 |20202
personB|2012-01-01|50 |20203
Desired output
people | g_date | wage|quarter
personA|2010-01-01|100 |20201
personA|2010-01-01|100 |20202
personB|2012-01-01|50 |20201
personB|2012-01-01|50 |20202
personB|2012-01-01|50 |20203
I have used the code below, but it removes too many rows, leaving only one row per person.
df<-df[order(df$g_date),]
df<-df[!duplicated(df$people, fromLast = TRUE),]
Another option is using group_by with an ordered slice_max, like this:
library(dplyr)
df %>%
  group_by(people, quarter) %>%
  slice_max(order_by = g_date, n = 1)
#> # A tibble: 5 × 4
#> # Groups: people, quarter [5]
#> people g_date wage quarter
#> <chr> <chr> <dbl> <int>
#> 1 personA 2010-01-01 100 20201
#> 2 personA 2010-01-01 100 20202
#> 3 personB 2012-01-01 50 20201
#> 4 personB 2012-01-01 50 20202
#> 5 personB 2012-01-01 50 20203
Created on 2022-12-15 with reprex v2.0.2
A base R alternative: aggregate computes the latest g_date per person, and merge keeps only the rows of df that match it:
merge(df, aggregate(. ~ people, df[1:2], max))
#> people g_date wage quarter
#> 1 personA 2010-01-01 100 20201
#> 2 personA 2010-01-01 100 20202
#> 3 personB 2012-01-01 50 20201
#> 4 personB 2012-01-01 50 20202
#> 5 personB 2012-01-01 50 20203
Update (thanks to @Villalba, removed first answer):
We could first group, arrange, and then filter:
library(dplyr)
library(lubridate)
df %>%
  group_by(people, quarter) %>%
  mutate(g_date = ymd(g_date)) %>%
  arrange(g_date, .by_group = TRUE) %>%
  filter(row_number() == n())
people g_date wage quarter
<chr> <date> <int> <int>
1 personA 2010-01-01 100 20201
2 personA 2010-01-01 100 20202
3 personB 2012-01-01 50 20201
4 personB 2012-01-01 50 20202
5 personB 2012-01-01 50 20203

R: Sumif equivalent from one dataframe to another

I have two dataframes that look somewhat like this (they're much bigger):
df1 <- data.frame(center = c("5012","5012","5025"),
                  product = c("Apple","Grape","Apple"),
                  value = c(20,30,50))
df1:
Center Product Value
1 5012 Apple 20
2 5012 Grape 30
3 5025 Apple 50
df2 <- data.frame(center = c("5012","5012","5012","5012","5012","5025"),
                  profitCenter = c("A","B","C","D","A","A"),
                  product = c("Apple","Apple","Apple","Apple","Grape","Apple"),
                  volume = c(20,30,50,70,60,80))
df2:
Center Profitcenter Product Volume
1 5012 A Apple 20
2 5012 B Apple 30
3 5012 C Apple 50
4 5012 D Apple 70
5 5012 A Grape 60
6 5025 A Apple 80
I want to get the sum of the "volume" column from DF2 by "center" and "product" into DF1. In Excel I would do a SUMIF, but I'm struggling to work out how to do this properly in R:
DF1:
Center Product Value Volume
1 5012 Apple 20 170
2 5012 Grape 30 60
3 5025 Apple 50 80
Currently I'm creating an aggregated version of DF2 (with dplyr's group_by) and then doing a left_join, but I have to do this a few more times and I'm sure there's a better way.
Similar to what @MrFlick commented, but I would prefer to first summarise df2 and then add the resulting column to df1; the outcome and performance are the same, as @r2evans pointed out:
library(tidyverse)
df2 %>%
  group_by(center, product) %>%
  summarise(Volume = sum(volume)) %>%
  right_join(df1, by = c("center", "product"))
Or:
library(tidyverse)
df1 %>%
  left_join(
    df2 %>%
      group_by(center, product) %>%
      summarise(sum(volume)),
    by = c("center", "product")
  )
Or (to avoid nesting, thanks @r2evans again):
library(tidyverse)
df2 %>%
  group_by(center, product) %>%
  summarise(Volume = sum(volume)) %>%
  left_join(df1, ., by = c("center", "product"))
Output:
center product value sum(volume)
1 5012 Apple 20 170
2 5012 Grape 30 60
3 5025 Apple 50 80
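One more compact variant (a sketch not in the original answers, assuming dplyr's count() with its wt argument), which rolls the grouped sum and the join into two steps:
library(dplyr)
df1 %>%
  left_join(count(df2, center, product, wt = volume, name = "Volume"),
            by = c("center", "product"))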

Joining two data frames using range of values

I have two data sets I would like to join. The income_range data is the master dataset, and I would like to join data_occ to income_range based on which band the income falls inside. Where more than one observation (income) falls within a band, I would like to take the lower income.
I was attempting to use data.table but was having trouble. I would also like to keep all columns from both data.frames if possible.
The output dataset should only have 7 observations.
library(data.table)
library(dplyr)
income_range <- data.frame(id = "France",
                           inc_lower = c(10, 21, 31, 41, 51, 61, 71),
                           inc_high = c(20, 30, 40, 50, 60, 70, 80),
                           perct = c(1, 2, 3, 4, 5, 6, 7))
data_occ <- data.frame(id = rep(c("France", "Belgium"), each = 50),
                       income = sample(10:80, 50),
                       occ = rep(c("manager", "clerk", "manual", "skilled", "office"), each = 20))
setDT(income_range)
setDT(data_occ)
First attempt.
df2 <- income_range[data_occ,
                    on = .(id, inc_lower <= income, inc_high >= income),
                    .(id, income, inc_lower, inc_high, perct, occ)]
Thank you in advance.
Since you tagged dplyr, here's one possible solution using the fuzzyjoin package, which works in the same pipe-friendly style:
library(fuzzyjoin)
library(dplyr)
# join dataframes on id == id, inc_lower <= income, inc_high >= income
joined <- income_range %>%
  fuzzy_left_join(data_occ,
                  by = c('id' = 'id', 'inc_lower' = 'income', 'inc_high' = 'income'),
                  match_fun = list(`==`, `<=`, `>=`)) %>%
  rename(id = id.x) %>%
  select(-id.y)
# sort by income, and keep only the first row of every unique perct
result <- joined %>%
  arrange(income) %>%
  group_by(perct) %>%
  slice(1)
And the (intermediate) results:
> head(joined)
id inc_lower inc_high perct income occ
1 France 10 20 1 10 manager
2 France 10 20 1 19 manager
3 France 10 20 1 14 manager
4 France 10 20 1 11 manager
5 France 10 20 1 17 manager
6 France 10 20 1 12 manager
> result
# A tibble: 7 x 6
# Groups: perct [7]
id inc_lower inc_high perct income occ
<chr> <dbl> <dbl> <dbl> <int> <chr>
1 France 10 20 1 10 manager
2 France 21 30 2 21 manual
3 France 31 40 3 31 manual
4 France 41 50 4 43 manager
5 France 51 60 5 51 clerk
6 France 61 70 6 61 manager
7 France 71 80 7 71 manager
I've added the intermediate dataframe joined for ease of understanding. You can omit it and just chain the two command chains together with %>%.
Here is one data.table approach:
cols = c("inc_lower", "inc_high")
# duplicate income into helper columns so it can be matched against the bands
data_occ[, (cols) := income]
# non-equi join; mult="first" keeps the lowest matching income per band
result = data_occ[order(income)
                  ][income_range,
                    on = .(id, inc_lower >= inc_lower, inc_high <= inc_high),
                    mult = "first"]
# remove the helper columns again
data_occ[, (cols) := NULL]
result
# id income occ inc_lower inc_high perct
# 1: France 10 clerk 10 20 1
# 2: France 21 manager 21 30 2
# 3: France 31 clerk 31 40 3
# 4: France 41 clerk 41 50 4
# 5: France 51 clerk 51 60 5
# 6: France 62 manager 61 70 6
# 7: France 71 manager 71 80 7

Aggregation of the region's values in the dataset

df <- read.csv('https://raw.githubusercontent.com/ulklc/covid19-timeseries/master/countryReport/raw/rawReport.csv',
               stringsAsFactors = FALSE)
I processed the dataset.
Can we find the day with the fewest deaths in the Asia region?
The important thing here is the sum of the deaths of all countries in the Asia region; accordingly, we sort by that sum and find the day.
As output:
date region death
2020/02/17 asia 6300 (sum for the Asia region)
The data in the output are just examples; the numbers are not real.
Since these are cumulative cases and deaths, we need to difference the data.
library(dplyr)
df %>%
  mutate(day = as.Date(day)) %>%
  filter(region == "Asia") %>%
  group_by(day) %>%
  summarise(deaths = sum(death)) %>%
  mutate(d = c(first(deaths), diff(deaths))) %>%
  arrange(d)
# A tibble: 107 x 3
day deaths d
<date> <int> <int>
1 2020-01-23 18 1 # <- this day saw only 1 death in the whole of Asia
2 2020-01-29 133 2
3 2020-02-21 2249 3
4 2020-02-12 1118 5
5 2020-01-24 26 8
6 2020-02-23 2465 10
7 2020-01-26 56 14
8 2020-01-25 42 16
9 2020-01-22 17 17
10 2020-01-27 82 26
# ... with 97 more rows
So the second day of records saw the least number of deaths recorded (so far).
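If only the single day is needed, replacing arrange(d) with slice_min(d, n = 1) (available in dplyr >= 1.0) would return just that row rather than the full sorted table (a sketch, not part of the original answer).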
Using the dplyr package for the data handling:
df <- read.csv('https://raw.githubusercontent.com/ulklc/covid19-timeseries/master/countryReport/raw/rawReport.csv',
               stringsAsFactors = FALSE)
library(dplyr)
df_sum <- df %>%
  group_by(region, day) %>%                      # grouping by region and day
  summarise(death = sum(death)) %>%              # summing within the groups
  filter(region == "Asia", death == min(death))  # keeping only the minimum for Asia
Then you have:
> df_sum
# A tibble: 1 x 3
# Groups: region [1]
region day death
<fct> <fct> <int>
1 Asia 2020/01/22 17

R - Create new column with cumulative means by group

I have the following data frame, listing the spend in each category on each day:
Dataframe: actualSpends
Date Category Spend ($)
2017/01/01 Apple 10
2017/01/02 Apple 12
2017/01/03 Apple 8
2017/01/01 Banana 13
2017/01/02 Banana 15
2017/01/03 Banana 7
I want to create a new data frame that lists the average amount spent for each category, for each day of the month
(e.g. on the 3rd of the month, the average spend over all the days of the month that have passed so far, from the 1st to the 31st of each month).
EDIT:
So the output should look something like:
Date Category AvgSpend ($)
2017/01/01 Apple 10
2017/01/02 Apple 11
2017/01/03 Apple 10
2017/01/01 Banana 13
2017/01/02 Banana 14
2017/01/03 Banana 11.7
Here, for each category, the average spend on each day is an average over all the days so far: the 1st is an average of the 1st; the 2nd is an average of the 1st and 2nd; the 3rd is an average of the 1st, 2nd and 3rd.
Is there a way to do this?
We can use the cummean function from the dplyr package to calculate cumulative averages for each category; then melt the results into a new column:
library(dplyr)
library(reshape2)
unq <- unique(df$Category)
df$AvgSpend <- melt(
  sapply(1:length(unq),
         function(i) cummean(df$Spending[which(df$Category == unq[i])])))$value
Output:
Date Category Spending AvgSpend
1 2017/01/01 Apple 10 10.00000
2 2017/01/02 Apple 12 11.00000
3 2017/01/03 Apple 8 10.00000
4 2017/01/01 Banana 13 13.00000
5 2017/01/02 Banana 15 14.00000
6 2017/01/03 Banana 7 11.66667
Sample data:
df <- data.frame(Date=c("2017/01/01","2017/01/02","2017/01/03",
"2017/01/01","2017/01/02","2017/01/03"),
Category=c("Apple","Apple","Apple",
"Banana","Banana","Banana"),
Spending=c(10,12,8,13,15,7))
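For what it's worth, a more direct sketch (assuming the same df, with rows already in date order within each category): a grouped mutate() with cummean() avoids the sapply/melt detour entirely.
library(dplyr)
df %>%
  group_by(Category) %>%                   # the cumulative mean restarts per category
  mutate(AvgSpend = cummean(Spending)) %>%
  ungroup()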
Here is a tidyverse option
library(tidyverse)
df %>%
  group_by(Date, Category) %>%
  summarise(Spending = mean(Spending, na.rm = TRUE))
# A tibble: 4 x 3
# Groups: Date [?]
# Date Category Spending
# <fctr> <fctr> <dbl>
#1 2017/01/01 Apple 11
#2 2017/01/02 Banana 14
#3 2017/01/03 Apple 8
#4 2017/01/03 Banana 7
You can use the 'sqldf' package (https://cran.r-project.org/web/packages/sqldf/sqldf.pdf):
install.packages("sqldf")
library(sqldf)
actualSpends <- data.frame(
  Date = c('2017/01/01','2017/01/02','2017/01/03','2017/01/01','2017/01/02','2017/01/03'),
  Category = c('Apple','Apple','Apple','Banana','Banana','Banana'),
  Spend = c(10,12,8,13,15,7))
sqldf("select Date, Category, sum(Spend) from actualSpends
       group by Date, Category")
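Note that this query returns per-group sums, not the running average the question asks for. If the SQLite bundled with RSQLite supports window functions (SQLite 3.25+), a hedged sketch of the cumulative version would be:
sqldf("select Date, Category,
              avg(Spend) over (partition by Category order by Date) as AvgSpend
       from actualSpends")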
