R dplyr - Sum values for different factors - r

I have multiple factors ("a","b","c") in my dataset, each with corresponding values for Price and Cost.
dat <- data.frame(
ProductCode = c("a", "a", "b", "b", "c", "c"),
Price = c(24, 37, 78, 45, 20, 34),
Cost = c(10,15,45,25,10,17)
)
I am looking for the sum of Price and Cost for each ProductCode.
by.code <- group_by(dat, code)
by.code <- summarise(by.code,
SumPrice = sum(Price),
SumCost = sum(Cost))
This code does not work as it sums all values in the column, without breaking them into categories.
SumPrice SumCost
1 238 122
Thanks in advance for your help.

This is not dplyr, but if you don't mind using the sqldf or data.table package, this answer is for you:
sqldf("select ProductCode, sum(Price) as PriceSum, sum(Cost) as CostSum from dat group by ProductCode")
ProductCode PriceSum CostSum
a 61 25
b 123 70
c 54 27
OR using the data.table package:
library(data.table)
MM<-data.table(dat)
MM[, list(sum(Price),sum(Cost)), by = ProductCode]
ProductCode V1 V2
1: a 61 25
2: b 123 70
3: c 54 27

Your code works fine. There was just a typo: the column in your data frame is named ProductCode, but your code groups by code. Rename the ProductCode column to code (or group by ProductCode instead) and your code works. I just did that and R gives the proper output. Below is the code:
library(dplyr)
dat <- data.frame(
code = c("a", "a", "b", "b", "c", "c"),
Price = c(24, 37, 78, 45, 20, 34),
Cost = c(10,15,45,25,10,17)
)
dat
by.code <- group_by(dat, code)
by.code <- summarise(by.code,
SumPrice = sum(Price),
SumCost = sum(Cost))
by.code

We can use aggregate from base R
aggregate(.~ProductCode, dat, sum)
# ProductCode Price Cost
#1 a 61 25
#2 b 123 70
#3 c 54 27

Related

Compute the difference between two columns by pair in R

I have the following data:
names <- c("a", "b", "c", "d")
scores <- c(95, 55, 100, 60)
df <- cbind.data.frame(names, scores)
I want to "extend" this data frame to make name pairs for every possible combination of names without repetition like so:
names_1 <- c("a", "a", "a", "b", "b", "c")
names_2 <- c("b", "c", "d", "c", "d", "d")
scores_1 <- c(95, 95, 95, 55, 55, 100)
scores_2 <- c(55, 100, 60, 100, 60, 60)
df_extended <- cbind.data.frame(names_1, names_2, scores_1, scores_2)
In the extended data, scores_1 are the scores for the corresponding name in names_1, and scores_2 are for names_2.
The following bit of code makes the appropriate name pairs. But I do not know how to get the scores in the right place after that.
t(combn(df$names,2))
The final goal is to get the row-wise difference between scores_1 and scores_2.
df_extended$score_diff <- abs(df_extended$scores_1 - df_extended$scores_2)
df_ext <- data.frame(t(combn(df$names, 2,\(x)c(x, df$scores[df$names %in%x]))))
df_ext <- setNames(type.convert(df_ext, as.is =TRUE), c('name_1','name_2', 'type_1', 'type_2'))
df_ext
name_1 name_2 type_1 type_2
1 a b 95 55
2 a c 95 100
3 a d 95 60
4 b c 55 100
5 b d 55 60
6 c d 100 60
names <- c("a", "b", "c", "d")
scores <- c(95, 55, 100, 60)
df <- cbind.data.frame(names, scores)
library(tidyverse)
map(df, ~combn(x = .x, m = 2)%>% t %>% as_tibble) %>%
imap_dfc(~set_names(x = .x, nm = paste(.y, seq(ncol(.x)), sep = "_"))) %>%
mutate(score_diff = scores_1 - scores_2)
#> # A tibble: 6 × 5
#> names_1 names_2 scores_1 scores_2 score_diff
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 a b 95 55 40
#> 2 a c 95 100 -5
#> 3 a d 95 60 35
#> 4 b c 55 100 -45
#> 5 b d 55 60 -5
#> 6 c d 100 60 40
Created on 2022-06-06 by the reprex package (v2.0.1)
First, we can create a new data frame with the unique combinations of names. Then, we can merge on the scores to match the names for both names_1 and names_2 to get the final data.frame.
names <- c("a", "b", "c", "d")
scores <- c(95, 55, 100, 60)
df <- cbind.data.frame(names, scores)
new_df <- data.frame(t(combn(df$names,2)))
names(new_df)[1] <- "names_1"; names(new_df)[2] <- "names_2"
new_df <- merge(new_df, df, by.x = 'names_1', by.y = 'names')
new_df <- merge(new_df, df, by.x = 'names_2', by.y = 'names')
names(new_df)[3] <- "scores_1"; names(new_df)[4] <- "scores_2"
> new_df
names_2 names_1 scores_1 scores_2
1 b a 95 55
2 c a 95 100
3 c b 55 100
4 d a 95 60
5 d b 55 60
6 d c 100 60

How to add a row, which is a sum of some values in a column based on specific values in other column?

I am trying to add two rows to the data frame.
Regarding the first row, its value in the MODEL column should be X, total_value should be the sum of total_value over the rows where MODEL is A or C, and total_frequency should be the sum of total_frequency over the rows where MODEL is A or C.
In the second row, the value in the MODEL column should be Z, total_value should be the sum of total_value over the rows where MODEL is D, E or F, and total_frequency should be the sum of total_frequency over the rows where MODEL is D, E or F.
I am stuck, as I do not know how to select specific values of MODEL and then sum these two other columns.
Here is my data
data.frame(MODEL=c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J"), total_value= c(62, 54, 78, 38, 16, 75, 39, 13, 58, 37),
total_frequency = c(78, 83, 24, 13, 22, 52, 16, 16, 20, 72))
You can try with dplyr, calculating the "new rows", then put together with the data df:
library(dplyr)
first <- df %>%
# select the models you need
filter(MODEL %in% c("A","C")) %>%
# call them x
mutate(MODEL = 'X') %>%
# grouping
group_by(MODEL) %>%
# calculate the sums
summarise_all(sum)
# same with the second
second <- df %>%
filter(MODEL %in% c("D","F","E")) %>%
mutate(MODEL = 'Z') %>%
group_by(MODEL) %>% summarise_all(sum)
# put together
rbind(df, first, second)
# A tibble: 12 x 3
MODEL total_value total_frequency
1 A 62 78
2 B 54 83
3 C 78 24
4 D 38 13
5 E 16 22
6 F 75 52
7 G 39 16
8 H 13 16
9 I 58 20
10 J 37 72
11 X 140 102
12 Z 129 87
The following code is a straightforward solution to the problem.
i1 <- df1$MODEL %in% c("A", "C")
total_value <- sum(df1$total_value[i1])
total_frequency <- sum(df1$total_frequency[i1])
df1 <- rbind(df1, data.frame(MODEL = "X", total_value, total_frequency))
i2 <- df1$MODEL %in% c("D", "E", "F")
total_value <- sum(df1$total_value[i2])
total_frequency <- sum(df1$total_frequency[i2])
df1 <- rbind(df1, data.frame(MODEL = "Z", total_value, total_frequency))
df1
# MODEL total_value total_frequency
#1 A 62 78
#2 B 54 83
#3 C 78 24
#4 D 38 13
#5 E 16 22
#6 F 75 52
#7 G 39 16
#8 H 13 16
#9 I 58 20
#10 J 37 72
#11 X 140 102
#12 Z 129 87
It is also possible to write a function to avoid repeating the same code.
# Append one summary row to X: its MODEL is M, and its total_value and
# total_frequency are the sums over the rows of X whose MODEL is in vals.
fun <- function(X, M, vals) {
  picked <- is.element(X$MODEL, vals)
  summary_row <- data.frame(
    MODEL = M,
    total_value = sum(X$total_value[picked]),
    total_frequency = sum(X$total_frequency[picked])
  )
  rbind(X, summary_row)
}
df1 <- fun(df1, M = "X", vals = c("A", "C"))
df1 <- fun(df1, M = "Z", vals = c("D", "E", "F"))

Compare and merge two dataframes

I have the following two dataframes in R:
df1 = data.frame(c("A", "A", "A", "B", "B"), c(1, 11, 21, 35, 45), c(6, 20, 30, 40, 60), c(1, 2, 3, 4, 5))
colnames(df1) = c("X", "Y", "Z", "score")
df1
X Y Z score
1 A 1 6 1
2 A 11 20 2
3 A 21 30 3
4 B 35 40 4
5 B 45 60 5
df2 = data.frame(c("A", "A", "A", "A", "B", "B", "B", "C"), c(1, 6, 21, 50, 20, 31, 50, 10), c(5, 20, 30, 60, 30, 40, 60, 20), c("x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8"))
colnames(df2) = c("X", "Y", "Z", "out")
df2
X Y Z out
1 A 1 5 x1
2 A 6 20 x2
3 A 21 30 x3
4 A 50 60 x4
5 B 20 30 x5
6 B 31 40 x6
7 B 50 60 x7
8 C 10 20 x8
For every row in df1, I want to check:
is there a match with the value in 'X' and any other 'X' value from df2
if the above is true: I want to check if the values from 'Y' and 'Z' are in the range of the values 'Y' and 'Z' from df2
if both are true: then I want to add the value from 'out' to df1.
This is how the output should look like:
# Expected result: df1 with an extra `out` column listing the `out` value of
# every df2 row that has the same X and an overlapping [Y, Z] range.
# The two B rows overlap df2's B ranges 31-40 (x6) and 50-60 (x7).
output <- data.frame(
  X = c("A", "A", "A", "B", "B"),
  Y = c(1, 11, 21, 35, 45),
  Z = c(6, 20, 30, 40, 60),
  score = c(1, 2, 3, 4, 5),
  out = c("x1, x2", "x2", "x3", "x6", "x7")
)
X Y Z score out
1 A 1 6 1 x1, x2
2 A 11 20 2 x2
3 A 21 30 3 x3
4 B 35 40 4 x6
5 B 45 60 5 x7
The original df1 is kept with an extra column 'out' that is added.
Line 1 from 'output', contains 'x1, x2' in column 'out'. Why: there is a match between the values in column 'X' and range 1 to 6 overlap with lines 1 and 2 from df2.
I've asked this question before (Compare values from two dataframes and merge) where it is suggested to use the foverlaps function. However because of the different columns between df1 and df2 and the extra rows in df2, I cannot make it work.
Here are two possible ways, a) using the newly implemented non equi joins feature, and b) foverlaps as you'd specifically mentioned that..
a) non-equi joins
dt2[dt1, on=.(X, Z>=Y, Y<=Z),
.(score, out=paste(out, collapse=",")),
by=.EACHI]
where dt1 and dt2 are data.tables corresponding to df1 and df2. Note that you'll have to swap the column names Z and Y in the result (since the column names come from dt2 but the values come from dt1).
Matching rows from dt2 corresponding to each row in dt1 are found based on the condition provided to the on argument, and .() is evaluated for each set of matching rows (because of by=.EACHI).
b) foverlaps
setkey(dt1, X, Y, Z)
olaps <- foverlaps(dt2, dt1, type="any", nomatch=0L)
olaps[, .(score=score[1L], out=paste(out, collapse=",")), by=.(X,Y,Z)]
library(dplyr)
# Example inputs. Strings are kept as character (not factor) columns;
# FALSE is spelled out because the alias F can be reassigned.
df1 <- data.frame(
  X = c("A", "A", "A", "B", "B"),
  Y = c(1, 11, 21, 35, 45),
  Z = c(6, 20, 30, 40, 60),
  score = c(1, 2, 3, 4, 5),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  X = c("A", "A", "A", "A", "B", "B", "B", "C"),
  Y = c(1, 6, 21, 50, 20, 31, 50, 10),
  Z = c(5, 20, 30, 60, 30, 40, 60, 20),
  out = c("x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8"),
  stringsAsFactors = FALSE
)
# For every df1 row, collect the `out` values of all df2 rows that share the
# same X and whose closed range [Y, Z] overlaps the df1 row's range.
df1 %>%
  left_join(df2, by = "X") %>% # pair each df1 row with every df2 row of the same X
  # Two closed intervals overlap iff each starts no later than the other ends.
  # Unlike intersecting seq(Y, Z) sequences, this also works for non-integer
  # bounds, drops unmatched (NA) rows instead of erroring, and needs no
  # row-by-row evaluation.
  filter(Y.x <= Z.y, Z.x >= Y.y) %>%
  group_by(X, Y.x, Z.x, score) %>% # one group per original df1 row
  summarise(out = paste(out, collapse = ", ")) %>% # combine out column
  ungroup() %>%
  rename(Y = Y.x, Z = Z.x)
# # A tibble: 5 × 5
# X Y Z score out
# <chr> <dbl> <dbl> <dbl> <chr>
# 1 A 1 6 1 x1, x2
# 2 A 11 20 2 x2
# 3 A 21 30 3 x3
# 4 B 35 40 4 x6
# 5 B 45 60 5 x7
The above process is based on dplyr package and involves a join and some grouping and filtering. If your initial datasets (df1, df2) are extremely large then the join will create an even bigger dataset that will need some time to be created.
Also, note that this process works with character and not factor variables. The process might convert factor variables to character if it tries to join factor variables with different levels.
I'd suggest you run the chained commands step by step to see how it works and spot if I missed anything that might lead to bugs in the code.
Here is another option using sqldf
library(sqldf)
# Left-join each df1 row to the df2 rows with the same X whose closed range
# [Y, Z] overlaps it. The overlap test is "each range starts no later than
# the other ends", which also catches a df2 range that fully contains the
# df1 range (neither df2 endpoint lies inside the df1 range in that case).
xx <- sqldf(paste(
  "select t1.*, t2.out from df1 t1",
  "left join df2 t2",
  "on t1.X = t2.X and t2.Y <= t1.Z and t2.Z >= t1.Y"
))
# Collapse the matches of each df1 row (all columns except the last) into a
# single comma-separated string.
aggregate(xx[ncol(xx)], xx[-ncol(xx)], FUN = function(X) paste(unique(X), collapse=", "))

Summation of variables by Groups in R

I have a data frame, and I'd like to create a new column that gives the sum of a numeric variable grouped by factors. So something like this:
BEFORE:
data1 <- data.frame(month = c(1, 1, 2, 2, 3, 3),
sex = c("m", "f", "m", "f", "m", "f"),
value = c(10, 20, 30, 40, 50, 60))
AFTER:
data2 <- data.frame(month = c(1, 1, 2, 2, 3, 3),
sex = c("m", "f", "m", "f", "m", "f"),
value = c(10, 20, 30, 40, 50, 60),
sum = c(30, 30, 70, 70, 110, 110))
In Stata you can do this with the egen command quite easily. I've tried the aggregate function, and the ddply function but they create entirely new data frames, and I just want to add a column to the existing one.
You are looking for ave
> data2 <- transform(data1, sum=ave(value, month, FUN=sum))
month sex value sum
1 1 m 10 30
2 1 f 20 30
3 2 m 30 70
4 2 f 40 70
5 3 m 50 110
6 3 f 60 110
data1$sum <- ave(data1$value, data1$month, FUN=sum) is useful if you don't want to use transform
Also data.table is helpful
library(data.table)
DT <- data.table(data1)
DT[, sum:=sum(value), by=month]
UPDATE
We can also use a tidyverse approach which is simple, yet elegant:
> library(tidyverse)
> data1 %>%
group_by(month) %>%
mutate(sum=sum(value))
# A tibble: 6 x 4
# Groups: month [3]
month sex value sum
<dbl> <fct> <dbl> <dbl>
1 1 m 10 30
2 1 f 20 30
3 2 m 30 70
4 2 f 40 70
5 3 m 50 110
6 3 f 60 110

Two by two matching between dataframes in r

I need to combine two data frames (df1 and df2) by matching up two site columns of each data frame to produce a third data frame (df3).
df1 = data.frame(Site.1=c("A","A","B"),
Site.2=c("B","C","C"),
Score1=c(60,70,80))
df1
Site.1 Site.2 Score1
1 A B 60
2 A C 70
3 B C 80
df2 = data.frame(Site.1=c("B","A","A"),
Site.2=c("C","B","C"),
Score2=c(10,20,30))
df2
Site.1 Site.2 Score2
1 B C 10
2 A B 20
3 A C 30
df3 = data.frame(Site.1=c("A","A","B"),
Site.2=c("B","C","C"),
Score1=c(60,70,80),
Score2=c(20,30,10))
df3
Site.1 Site.2 Score1 Score2
1 A B 60 20
2 A C 70 30
3 B C 80 10
You want the merge function. Since your column names that you want to match on already have the same name you don't even need to do anything special. If that wasn't the case you would want to look into the by.x and by.y parameters that merge takes.
df1 = data.frame(Site.1=c("A","A","B"),Site.2=c("B","C","C"),Score1=c(60,70,80))
df2 = data.frame(Site.1=c("B","A","A"),Site.2=c("C","B","C"), Score2=c(10,20,30))
df3 = data.frame(Site.1=c("A","A","B"),Site.2=c("B","C","C"), Score1=c(60,70,80),Score2=c(20,30,10))
df3
# Merge gives you what you want
merge(df1, df2)
dplyr may be helpful here.
library(dplyr)
df1 = data.frame(Site.1 = c("A", "A", "B"),
Site.2 = c("B", "C", "C"),
Score1 = c(60, 70, 80))
df2 = data.frame(Site.1 = c("B", "A", "A"),
Site.2 = c("C", "B", "C"),
Score2 = c(10, 20, 30))
inner_join(df1, df2)

Resources