Sum by groups in two columns in R - r

I have the following DF:
DAY BRAND SOLD
2018/04/10 KIA 10
2018/04/15 KIA 5
2018/05/01 KIA 7
2018/05/06 KIA 3
2018/04/04 BMW 2
2018/05/25 BMW 8
2018/06/19 BMW 5
2018/06/14 BMW 1
I would like to sum the units sold by month and repeat them in every row where the date belongs to the month (the sum can't be done for different BRANDS in the same MONTH, that's a condition), like this:
DAY BRAND SOLD TOTAL
2018/04/10 KIA 10 15
2018/04/15 KIA 5 15
2018/05/01 KIA 7 10
2018/05/06 KIA 3 10
2018/04/04 BMW 2 2
2018/05/25 BMW 8 8
2018/06/19 BMW 5 6
2018/06/14 BMW 1 6
How can I do this?

We can use ave after extracting the 'month' from the 'DAY' column and use that as grouping variable along with "BRAND"
# Build the month key ("04", "05", ...) from DAY, then let ave() broadcast
# the BRAND-by-month sum of SOLD back onto every row of its group.
df1$TOTAL <- ave(
  df1$SOLD,
  df1$BRAND,
  format(as.Date(df1$DAY, "%Y/%m/%d"), "%m"),
  FUN = sum
)
df1$TOTAL
#[1] 15 15 10 10 2 8 6 6
Or in dplyr/lubridate
library(dplyr)
library(lubridate)
# Group by BRAND and by the month number parsed from DAY (ymd() -> Date,
# month() -> numeric month); mutate() keeps every row and attaches the
# per-group sum of SOLD as TOTAL.
df1 %>%
group_by(BRAND, MONTH = month(ymd(DAY))) %>%
mutate(TOTAL = sum(SOLD))
# A tibble: 8 x 5
# Groups: BRAND, MONTH [5]
# DAY BRAND SOLD MONTH TOTAL
# <chr> <chr> <int> <dbl> <int>
#1 2018/04/10 KIA 10 4 15
#2 2018/04/15 KIA 5 4 15
#3 2018/05/01 KIA 7 5 10
#4 2018/05/06 KIA 3 5 10
#5 2018/04/04 BMW 2 4 2
#6 2018/05/25 BMW 8 5 8
#7 2018/06/19 BMW 5 6 6
#8 2018/06/14 BMW 1 6 6
Remove the 'MONTH' column after ungrouping with select(-MONTH) if needed
data
df1 <- structure(list(DAY = c("2018/04/10", "2018/04/15", "2018/05/01",
"2018/05/06", "2018/04/04", "2018/05/25", "2018/06/19", "2018/06/14"
), BRAND = c("KIA", "KIA", "KIA", "KIA", "BMW", "BMW", "BMW",
"BMW"), SOLD = c(10L, 5L, 7L, 3L, 2L, 8L, 5L, 1L)),
class = "data.frame", row.names = c(NA,
-8L))

Related

R Overwrite column values with non NA values from column in separate dataframe

I have a dataframe 'df1' with a lot of columns, but the ones of interest are:
Number
Code
1
2
3
10
11
AMRO
4
277
2100
BLPH
And I have another dataframe 'df2' with a lot of columns, but the ones of interest are:
Number
Code
1
AMCR
2
AMCR
3
BANO
10
BAEA
12
AMRO
4
NA
277
NA
2100
NA
I want matching values in the 'Number' columns of 'df1' and 'df2' to lead to values in the 'Code' column in 'df2' to overwrite the 'Code' values in 'df1' as long as the 'Code' values in 'df2' don't contain an NA, so that the final result of 'df1' looks like:
Number
Code
1
AMCR
2
AMCR
3
BANO
10
BAEA
11
AMRO
4
277
2100
BLPH
Thank you for your help!
We can do
library(powerjoin)
# Left join on Number; "conflict = coalesce" merges the two clashing Code
# columns into one by taking, per row, the first non-NA of the two values
# (presumably left table first — verify against the powerjoin docs).
power_left_join(df1, df2, by = "Number", conflict = coalesce)
-output
Number Code
1 1 AMCR
2 2 AMCR
3 3 BANO
4 10 BAEA
5 11 AMRO
6 4 <NA>
7 277 <NA>
8 2100 BLPH
Or to do an overwrite, use data.table
library(data.table)
# In-place update join: for df1 rows matched by df2 on Number, overwrite
# Code with fcoalesce(Code, i.Code) — keep df1's Code when it is non-NA,
# otherwise fill from df2's Code (the i. prefix refers to df2's column).
setDT(df1)[df2, Code := fcoalesce(Code, i.Code), on = .(Number)]
-output
> df1
Number Code
<int> <char>
1: 1 AMCR
2: 2 AMCR
3: 3 BANO
4: 10 BAEA
5: 11 AMRO
6: 4 <NA>
7: 277 <NA>
8: 2100 BLPH
data
df1 <- structure(list(Number = c(1L, 2L, 3L, 10L, 11L, 4L, 277L, 2100L
), Code = c(NA, NA, NA, NA, "AMRO", NA, NA, "BLPH")),
class = "data.frame", row.names = c(NA,
-8L))
df2 <- structure(list(Number = c(1L, 2L, 3L, 10L, 12L, 4L, 277L, 2100L
), Code = c("AMCR", "AMCR", "BANO", "BAEA", "AMRO", NA, NA, NA
)), class = "data.frame", row.names = c(NA, -8L))
Here is an alternative approach using bind_cols:
library(dplyr)
# bind_cols() pastes df1 and df2 side by side BY POSITION (no matching on
# Number); name repair renames the duplicates to Number...1/Code...2 (df1)
# and Number...3/Code...4 (df2), and coalesce() takes the first non-NA
# Code per row.
# NOTE(review): this relies on both data frames having the same row order;
# here df1's fifth Number is 11 while df2's is 12, so row 5 mixes the two —
# verify this is the intended behavior before reusing.
bind_cols(df1, df2) %>%
mutate(Code = coalesce(Code...2, Code...4)) %>%
select(Number = Number...1, Code)
Number Code
1 1 AMCR
2 2 AMCR
3 3 BANO
4 10 BAEA
5 11 AMRO
6 4 <NA>
7 277 <NA>
8 2100 BLPH
Here is a solution playing with dplyr full_join and inner_join
library(dplyr)
# Step 1: full_join(df2) stacks all Number/Code combinations from both
# tables (joining on all shared columns), and na.omit() drops the NA-Code
# rows. Step 2: the second full_join adds back the rows that df1 and df2
# share exactly (here the Numbers 4 and 277 whose Code is NA in both).
# Step 3: filter() restricts to Numbers present in df1 (dropping df2-only
# Number 12) and arrange() sorts the result.
df1 %>%
full_join(df2) %>% na.omit() %>%
full_join(df1 %>% inner_join(df2)) %>%
filter(Number %in% df1$Number) %>%
arrange(Number)
Output
#> Number Code
#> 1 1 AMCR
#> 2 2 AMCR
#> 3 3 BANO
#> 4 4 <NA>
#> 5 10 BAEA
#> 6 11 AMRO
#> 7 277 <NA>
#> 8 2100 BLPH

How to use R to replace missing values with the sum of previous 4 values in a column?

I have a dataframe that contains (among other things) three columns that have missing values every 5 rows. These missing values need to be replaced with the sum of the previous 4 values in their respective column.
For example, let's say my dataframe looked like this:
id category1 category2 category3
123 5 10 10
123 6 11 15
123 6 12 23
123 4 10 6
123 NA NA NA
567 24 17 15
Those NAs need to represent a "total" based on the sum of the previous 4 values in their column, and this needs to repeat throughout the entire dataframe because the NAs occur every 5 rows. For instance, the three NAs in the mock example above should be replaced with 21, 43, and 54. 5 rows later, the same process will need to be repeated. How can I achieve this?
Another possible solution:
library(dplyr)
# Within each id group, replace every NA with the sum of that column's
# non-NA values (na.rm = TRUE skips the NA being filled, so the "total"
# row becomes the sum of the other rows in its group).
# Fix: use TRUE rather than the reassignable shorthand T.
df %>%
  group_by(id) %>%
  mutate(across(everything(), ~ if_else(is.na(.x), sum(.x, na.rm = TRUE), .x))) %>%
  ungroup()
#> # A tibble: 6 × 4
#> id category1 category2 category3
#> <int> <int> <int> <int>
#> 1 123 5 10 10
#> 2 123 6 11 15
#> 3 123 6 12 23
#> 4 123 4 10 6
#> 5 123 21 43 54
#> 6 567 24 17 15
The following should work if there are no occurrences of NA values within the first 4 rows and I am assuming that the NA values appear in all columns at the same time.
# Fill each NA "total" row with the sum of the 4 rows directly above it,
# column by column (columns 2:4). Assumes the NAs occur in columns 2:4
# simultaneously and never within the first 4 rows.
# BUG FIX: the original indexed seq(i-5, i-1), which spans FIVE rows; it
# only worked for the first NA (row 5) because the invalid index 0 was
# silently dropped. From the second NA onward it also summed the
# previously filled total row. seq(i-4, i-1) is exactly the previous 4.
for (i in seq_len(nrow(data))) {
  if (is.na(data[i, 2])) {
    prev_rows <- seq(i - 4, i - 1)
    data[i, 2] <- sum(data[prev_rows, 2])
    data[i, 3] <- sum(data[prev_rows, 3])
    data[i, 4] <- sum(data[prev_rows, 4])
  }
}
If the NAs appear at the end row for each 'id', we may remove it and do a group by summarise to create a row
library(dplyr)
# Drop the NA rows, then rebuild each id group: for every column, across()
# returns the original values with the group sum appended (c(.x, sum(.x))),
# so each id gains one extra "total" row at the end of its group. Note the
# shown output: id 567, which had no NA row, also gains a duplicate total
# row (rows 6 and 7).
# NOTE(review): summarise() returning multiple rows per group is deprecated
# in recent dplyr — reframe() is the recommended replacement; verify
# against the installed dplyr version.
df1 <- df1 %>%
na.omit %>%
group_by(id) %>%
summarise(across(everything(), ~ c(.x, sum(.x))), .groups = 'drop')
-output
df1
# A tibble: 7 × 4
id category1 category2 category3
<int> <int> <int> <int>
1 123 5 10 10
2 123 6 11 15
3 123 6 12 23
4 123 4 10 6
5 123 21 43 54
6 567 24 17 15
7 567 24 17 15
Or another approach would be to replace the NA with the sum using na.aggregate from zoo
library(zoo)
# na.aggregate() replaces each NA with an aggregate of the column's non-NA
# values; FUN = sum makes that aggregate the sum, and the surrounding
# group_by(id) restricts it to the same id group.
# NOTE(review): passing FUN through across()'s dots is deprecated in recent
# dplyr — prefer an explicit lambda, ~ na.aggregate(.x, FUN = sum).
df1 %>%
group_by(id) %>%
mutate(across(everything(), na.aggregate, FUN = sum)) %>%
ungroup
# A tibble: 6 × 4
id category1 category2 category3
<int> <int> <int> <int>
1 123 5 10 10
2 123 6 11 15
3 123 6 12 23
4 123 4 10 6
5 123 21 43 54
6 567 24 17 15
data
df1 <- structure(list(id = c(123L, 123L, 123L, 123L, 123L, 567L),
category1 = c(5L,
6L, 6L, 4L, NA, 24L), category2 = c(10L, 11L, 12L, 10L, NA, 17L
), category3 = c(10L, 15L, 23L, 6L, NA, 15L)),
class = "data.frame", row.names = c(NA,
-6L))

Converting days into weeks in R

I want to convert days into weeks with all the values from that week summed up
Right now I have the following df
Date x
1 2018-02-23 15
2 2018-03-26 4
3 2018-03-29 3
4 2018-03-30 6
5 2018-04-03 5
6 2018-04-04 12
7 2018-04-05 7
8 2018-04-06 5
9 2018-04-07 5
10 2018-04-09 13
11 2018-04-10 8
12 2018-04-11 2
ETC.
The x in this df stands for amount of items sent on a certain day.
There are days in this df where no items are being transported.
This df has a total of 688 tuples.
What I would like to see it:
Date x
1 Week 8 2018 19
2 Week 9 2018 26
3 Week 10 2018 33
ETC.
Can someone help me out?
You can use aggregate and get the weeks with format %V:
# format "%V" yields the ISO-8601 week number (01-53); grouping on the
# "week year" string, aggregate() sums x per week.
aggregate(df$x, list(Date=format(df$Date, "%V %Y")), sum)
# Date x
#1 08 2018 15
#2 13 2018 13
#3 14 2018 34
#4 15 2018 23
Or with Week (Thanks to #sindri-baldur for the comment):
aggregate(df$x, list(Date=sub("^0?", "Week ", format(df$Date, "%V %Y"))), sum)
#aggregate(df$x, list(Date=format(df$Date, "Week %-V %Y")), sum) #Alternative
# Date x
#1 Week 13 2018 13
#2 Week 14 2018 34
#3 Week 15 2018 23
#4 Week 8 2018 15
Data:
df <- read.table(header=TRUE, text=" Date x
1 2018-02-23 15
2 2018-03-26 4
3 2018-03-29 3
4 2018-03-30 6
5 2018-04-03 5
6 2018-04-04 12
7 2018-04-05 7
8 2018-04-06 5
9 2018-04-07 5
10 2018-04-09 13
11 2018-04-10 8
12 2018-04-11 2")
df$Date <- as.Date(df$Date)
library(lubridate)
library(tidyverse)
## Random data
# (unseeded runif(), so the totals differ between runs; call set.seed()
# first for reproducibility)
df <- data.frame(date=seq.Date(from = as.Date("2018-01-01"), to=as.Date("2018-12-31"), by = "day"),x=runif(n=365,min=0,max=25))
## Aggregating by week
# NOTE(review): lubridate::week() numbers weeks from Jan 1 in 7-day steps,
# which need not match ISO "%V" weeks — confirm which numbering is wanted.
df2 <- df %>%
mutate(week = lubridate::week(ymd(date))) %>%
group_by(week) %>%
summarise(total_per_week = sum(x))
Using collapse
library(collapse)
library(lubridate)
library(magrittr)
# collapse's fast verbs: ftransform() adds the week number derived from
# Date, fgroup_by()/fsummarise() aggregate per week, and fsum() computes
# the weekly total of x.
df %>%
ftransform(week = week(ymd(Date))) %>%
fgroup_by(week) %>%
fsummarise(total_per_week = fsum(x))
# week total_per_week
#1 8 15
#2 13 13
#3 14 34
#4 15 23
data
df <- structure(list(Date = c("2018-02-23", "2018-03-26", "2018-03-29",
"2018-03-30", "2018-04-03", "2018-04-04", "2018-04-05", "2018-04-06",
"2018-04-07", "2018-04-09", "2018-04-10", "2018-04-11"), x = c(15L,
4L, 3L, 6L, 5L, 12L, 7L, 5L, 5L, 13L, 8L, 2L)), class = "data.frame",
row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))
#akrun
This almost worked. Right now I get 52 rows out of 3 years of data:
week total_per_week
1 1 246
2 2 491
3 3 458
4 4 556
5 5 454
6 6 594
7 7 592
8 8 496
9 9 567
10 10 615

Randomly assigning columns to other columns in R

I have a column of student names and a column containing the group number for each of those students. How could I randomly assign each student to be a judge of another group's work? Could anyone let me know how to build a function that solves this? Students cannot judge their own group's work.
Bob Ross 1
Kanye West 1
Chris Evans 1
Robert Jr 1
Bruce Wayne 2
Peter Parker 2
Steven Strange 2
Danny rand 2
Daniel Fisher 2
Rob Son 3
Son Bob 3
Chun Li 3
Ching Do 3
Ping Pong 3
Michael Jackson 4
Rich Brian 4
Ryan Gosling 4
Nathan Nguyen 4
Justin Bieber 4
Here's one way, using tidyverse methods. Basically this says for each value (map_int) in group, take a sample from the groups that aren't the current one.
library(tidyverse)
df <- structure(list(name = c("Kanye West", "Chris Evans", "Robert Jr", "Bruce Wayne", "Peter Parker", "Steven Strange", "Danny rand", "Daniel Fisher", "Rob Son", "Son Bob", "Chun Li", "Ching Do", "Ping Pong", "Michael Jackson", "Rich Brian", "Ryan Gosling", "Nathan Nguyen", "Justin Bieber"), group = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -18L))
df %>%
mutate(
# For each student's group (.x), sample one group id from all the other
# groups; map_int() applies this row-wise and returns an integer vector.
# Unseeded sampling: the assignments change on every run.
to_judge = map_int(
.x = group,
.f = ~ sample(
x = unique(group)[unique(group) != .x],
size = 1
)
)
)
#> # A tibble: 18 x 3
#> name group to_judge
#> <chr> <int> <int>
#> 1 Kanye West 1 4
#> 2 Chris Evans 1 2
#> 3 Robert Jr 1 3
#> 4 Bruce Wayne 2 1
#> 5 Peter Parker 2 3
#> 6 Steven Strange 2 3
#> 7 Danny rand 2 4
#> 8 Daniel Fisher 2 1
#> 9 Rob Son 3 1
#> 10 Son Bob 3 2
#> 11 Chun Li 3 4
#> 12 Ching Do 3 4
#> 13 Ping Pong 3 4
#> 14 Michael Jackson 4 2
#> 15 Rich Brian 4 3
#> 16 Ryan Gosling 4 1
#> 17 Nathan Nguyen 4 2
#> 18 Justin Bieber 4 1
Created on 2018-09-20 by the reprex package (v0.2.0).
Another option with tidyverse would be to group_by the group column, define the sample vector with setdiff and draw a sample of the size of the group:
df <- data.frame(Student = LETTERS[1:20],
Group = gl(4, 5))
library(tidyverse)
# For each group, draw one judge group per student (n() draws) from the
# set of all groups except the student's own. setdiff() on the factor
# returns character labels — hence the <chr> Judge column — and
# replace = TRUE allows the same judge group for several students.
# Fix: use TRUE rather than the reassignable shorthand T.
df %>%
  group_by(Group) %>%
  mutate(Judge = sample(setdiff(unique(df$Group), Group), n(), replace = TRUE))
# A tibble: 20 x 3
# Groups: Group [4]
Student Group Judge
<fct> <fct> <chr>
1 A 1 4
2 B 1 2
3 C 1 3
4 D 1 3
5 E 1 4
6 F 2 4
7 G 2 4
8 H 2 1
9 I 2 1
10 J 2 4
11 K 3 4
12 L 3 2
13 M 3 1
14 N 3 2
15 O 3 2
16 P 4 2
17 Q 4 1
18 R 4 2
19 S 4 1
20 T 4 3

Match and Remove Rows Based on Condition R [duplicate]

This question already has answers here:
Select the row with the maximum value in each group
(19 answers)
Closed 2 years ago.
I've got an interesting one for you all.
I'm looking to first: Look through the ID column and identify duplicate values. Once those are identified, the code should go through the income of the duplicated values and keep the row with the larger income.
So if there are three ID values of 2, it will look for the one with the highest income and keep that row.
ID Income
1 98765
2 3456
2 67
2 5498
5 23
6 98
7 5645
7 67871
9 983754
10 982
10 2374
10 875
10 4744
11 6853
I know it's as easy as subsetting based on a condition, but I don't know how to remove the rows based on whether the income in one cell is greater than the other (only done if the IDs match).
I was thinking of using an ifelse statement to create a new column to identify duplicates (through subsetting or not) then use the new column's values to ifelse again to identify the larger income. From there I can just subset based on the new columns I have created.
Is there a faster, more efficient way of doing this?
The outcome should look like this.
ID Income
1 98765
2 5498
5 23
6 98
7 67871
9 983754
10 4744
11 6853
Thank you
We can slice the rows by checking the highest value in 'Income' grouped by 'ID'
library(dplyr)
# Keep, per ID, the row holding the maximum Income. which.max() returns
# the index of the FIRST maximum, so ties keep the earliest row and the
# result always has exactly one row per ID.
df1 %>%
group_by(ID) %>%
slice(which.max(Income))
Or using data.table
library(data.table)
# .SD is the per-ID subset of the data; indexing it with which.max(Income)
# keeps the (first) maximum-Income row of each group.
setDT(df1)[, .SD[which.max(Income)], by = ID]
Or with base R
# Base R: ave() broadcasts each ID's maximum Income over that ID's rows;
# the comparison keeps exactly the rows equal to their group maximum
# (ties would keep every tied row).
df1[df1$Income == ave(df1$Income, df1$ID, FUN = max), ]
# ID Income
#1 1 98765
#4 2 5498
#5 5 23
#6 6 98
#8 7 67871
#9 9 983754
#13 10 4744
#14 11 6853
data
df1 <- structure(list(ID = c(1L, 2L, 2L, 2L, 5L, 6L, 7L, 7L, 9L, 10L,
10L, 10L, 10L, 11L), Income = c(98765L, 3456L, 67L, 5498L, 23L,
98L, 5645L, 67871L, 983754L, 982L, 2374L, 875L, 4744L, 6853L)),
class = "data.frame", row.names = c(NA,
-14L))
Using order with duplicated() (base R)
# Sort by ID and, within each ID, by decreasing Income so the richest row
# leads its group; duplicated() then flags every later (smaller) row of
# the same ID, and negating it keeps only the first — the maximum.
df <- df[order(df$ID, -df$Income), ]
df[!duplicated(df$ID), ]
ID Income
1 1 98765
4 2 5498
5 5 23
6 6 98
8 7 67871
9 9 983754
13 10 4744
14 11 6853
Here is another dplyr method. We can arrange the column and then slice the data frame for the first row.
library(dplyr)
# Sort so the largest Income leads each ID, take the first row per group
# with slice(1), and ungroup() so the result carries no grouping.
df2 <- df %>%
arrange(ID, desc(Income)) %>%
group_by(ID) %>%
slice(1) %>%
ungroup()
df2
# # A tibble: 8 x 2
# ID Income
# <int> <int>
# 1 1 98765
# 2 2 5498
# 3 5 23
# 4 6 98
# 5 7 67871
# 6 9 983754
# 7 10 4744
# 8 11 6853
DATA
df <- read.table(text = "ID Income
1 98765
2 3456
2 67
2 5498
5 23
6 98
7 5645
7 67871
9 983754
10 982
10 2374
10 875
10 4744
11 6853",
header = TRUE)
Group_by and summarise from dplyr would work too
# Collapse to one row per ID with the maximum Income. Unlike the slice()
# approaches, summarise() returns ONLY the ID and Income columns — any
# other columns in df1 would be dropped.
df1 %>%
group_by(ID) %>%
summarise(Income=max(Income))
ID Income
<int> <dbl>
1 1 98765.
2 2 5498.
3 5 23.
4 6 98.
5 7 67871.
6 9 983754.
7 10 4744.
8 11 6853.
Using sqldf: Group by ID and select the corresponding max Income
library(sqldf)
# SQL GROUP BY ID with max(Income) per group; note the result column is
# literally named "max(Income)" — add "as Income" in the query to alias it.
sqldf("select ID,max(Income) from df group by ID")
Output:
ID max(Income)
1 1 98765
2 2 5498
3 5 23
4 6 98
5 7 67871
6 9 983754
7 10 4744
8 11 6853

Resources