Consider the dataset:
# Toy income records for persons within households. data.frame() would turn
# the name "expected income" into expected.income anyway, so the syntactic
# name is written out directly.
example1 <- data.frame(
  year = c(1, 1, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 5),
  household = rep(c(1, 2), times = c(4, 9)),
  person = rep(c(1, 2), times = c(8, 5)),
  expected.income = seq(from = 140, to = 260, by = 10),
  income = seq(from = 110, to = 230, by = 10)
)
Just to have an idea person=1 is the father of the family and person=2 is the mother of the family, in the complete dataset there will be also the children, but it doesn't matter right now.
I need to calculate the ratio between column(4) "expected income" in year(i) and column(5)"income" in year (i+1).
Furthermore, the ratio has to be computed only when both the "person" and the "household" are the same.
For example, the ratio between col(4)-row(4) and col(5)-row(5) must not be calculated, because those rows belong to two men from different households,
and the same holds for col(4)-row(8) and col(5)-row(9), because those rows belong to two different people within the same household.
Instead of the ratio between the "expected income" and the "income" of two different people I need an NA.
It has to be done generically, since this is just a simplification of a dataset with more than 60000 rows.
It sounds like you need to group by household and person, then find the ratio of the expected income to the lead value of income:
library(tidyverse)
# lead() is evaluated within each (person, household) group, so the last row
# of every group has no following income and its ratio is NA.
example1 %>%
  group_by(person, household) %>%
  mutate(ratio = expected.income / lead(income, n = 1L))
#> # A tibble: 13 x 6
#> # Groups: person, household [3]
#> year household person expected.income income ratio
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 1 140 110 1.17
#> 2 2 1 1 150 120 1.15
#> 3 3 1 1 160 130 1.14
#> 4 4 1 1 170 140 NA
#> 5 1 2 1 180 150 1.12
#> 6 2 2 1 190 160 1.12
#> 7 3 2 1 200 170 1.11
#> 8 4 2 1 210 180 NA
#> 9 1 2 2 220 190 1.1
#> 10 2 2 2 230 200 1.10
#> 11 3 2 2 240 210 1.09
#> 12 4 2 2 250 220 1.09
#> 13 5 2 2 260 230 NA
Created on 2022-05-11 by the reprex package (v2.0.1)
Is this what you are looking for:
library(dplyr)
# Same-row ratio, kept only where the person id equals the household id;
# every other row matches no case_when() branch and becomes NA.
example1 %>%
  mutate(ratio = case_when(person == household ~ expected.income / income))
Output:
year household person expected.income income ratio
1 1 1 1 140 110 1.272727
2 1 1 1 150 120 1.250000
3 3 1 1 160 130 1.230769
4 4 1 1 170 140 1.214286
5 1 2 1 180 150 NA
6 2 2 1 190 160 NA
7 3 2 1 200 170 NA
8 4 2 1 210 180 NA
9 1 2 2 220 190 1.157895
10 2 2 2 230 200 1.150000
11 3 2 2 240 210 1.142857
12 4 2 2 250 220 1.136364
13 5 2 2 260 230 1.130435
First order by household, person and year. Then calculate the ratio and set all ratios to NA where the next line is not the next year, not the same household, or not the same person.
# Sort so that the rows of one person in one household are adjacent and in
# year order.
sorted <- example1[with(example1, order(household, person, year)), ]
# Shift a column up by one row; the last row has no successor and gets NA.
next_row <- function(x) c(x[-1], NA)
sorted$ratio <- sorted$expected.income / next_row(sorted$income)
# A ratio is only kept when the following row is the same person in the same
# household exactly one year later; everything else is set to NA.
mismatch <- next_row(sorted$year) != sorted$year + 1 |
  next_row(sorted$household) != sorted$household |
  next_row(sorted$person) != sorted$person
is.na(sorted$ratio) <- mismatch
sorted
# year household person expected.income income ratio
#1 1 1 1 140 110 NA
#2 1 1 1 150 120 NA
#3 3 1 1 160 130 1.142857
#4 4 1 1 170 140 NA
#5 1 2 1 180 150 1.125000
#6 2 2 1 190 160 1.117647
#7 3 2 1 200 170 1.111111
#8 4 2 1 210 180 NA
#9 1 2 2 220 190 1.100000
#10 2 2 2 230 200 1.095238
#11 3 2 2 240 210 1.090909
#12 4 2 2 250 220 1.086957
#13 5 2 2 260 230 NA
Don't know if stating two times with year 1 is a typo, but it shows if the condition of next year is considered.
I have a tall data frame as such:
# Wide example data: one row per id with five score/weight column pairs.
# The scores are random draws from 1:4 and no seed is set, so they differ
# between runs. `replace = TRUE` is spelled out in full: the abbreviated
# `r = T` depends on partial argument matching and on `T` not having been
# reassigned.
data <- data.frame(
  id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  group = c(1, 1, 2, 1, 2, 2, 2, 2, 1, 2),
  type = c(1, 1, 2, 3, 2, 2, 3, 3, 3, 1),
  score1 = sample(1:4, 10, replace = TRUE),
  score2 = sample(1:4, 10, replace = TRUE),
  score3 = sample(1:4, 10, replace = TRUE),
  score4 = sample(1:4, 10, replace = TRUE),
  score5 = sample(1:4, 10, replace = TRUE),
  weight1 = c(173, 109, 136, 189, 186, 146, 173, 102, 178, 174),
  weight2 = c(147, 187, 125, 126, 120, 165, 142, 129, 144, 197),
  weight3 = c(103, 192, 102, 159, 128, 179, 195, 193, 135, 145),
  weight4 = c(114, 182, 199, 101, 111, 116, 198, 123, 119, 181),
  weight5 = c(159, 125, 104, 171, 166, 154, 197, 124, 180, 154)
)
library(reshape2)
library(plyr)
# Stack the five score/weight column pairs into long format: each id gets one
# row per pair, numbered 1-5 in the `count` column.
score_cols <- paste0("score", 1:5)
weight_cols <- paste0("weight", 1:5)
data1 <- reshape(
  data,
  direction = "long",
  varying = list(score_cols, weight_cols),
  v.names = c("score", "weight"),
  timevar = "count",
  times = 1:5,
  idvar = "id"
)
# Keep all rows of one id together.
data1 <- data1[order(data1$id), ]
And what I want to create is a new data frame like so:
# Skeleton of the desired result: all 24 score/group/type combinations with
# the weighted count still to be filled in.
want <- data.frame(
  score = rep(1:4, times = 6),
  group = rep(1:2, times = 12),
  type = rep(1:3, times = 8),
  weightedCOUNT = NA # how to calculate this? count(data1, score, wt = weight)
)
I am just not sure how to calculate weightedCOUNT which should apply the weights to the score variable so then it gives in column 'weightedCOUNT' a weighted count that is aggregated by score and group and type.
An option would be to melt (from data.table, which can take multiple measure patterns) and then, grouped by 'group' and 'type', get the weighted count
library(data.table)
library(dplyr)
# Stack the score* and weight* columns pairwise into `score` and `weight`,
# then sum the weights per score within every (group, type) combination.
# Note that setDT() converts `data` to a data.table by reference.
long <- melt(
  setDT(data),
  measure.vars = patterns("^score", "^weight"),
  value.name = c("score", "weight")
)
long %>%
  group_by(group, type) %>%
  count(score, wt = weight)
If we need to have a complete set of combinations
library(tidyr)
# Weighted count per (group, type, score), expanded to every combination of
# the three variables. The count() step is required: it creates the `n`
# column that complete() fills with 0 for combinations that never occur.
melt(
  setDT(data),
  measure = patterns("^score", "^weight"),
  value.name = c("score", "weight")
) %>%
  group_by(group, type) %>%
  count(score, wt = weight) %>%
  ungroup() %>%
  complete(group, type, score, fill = list(n = 0))
If I understand correctly, weightedCOUNT is the sum of weights grouped by score, group, and type.
For the sake of completeness, I would like to show what the accepted solution would look like when implemented in pure base R and in pure data.table syntax, respectively.
Base R
The OP was almost there. He has already reshaped data from wide to long format for multiple value variables. Only the final aggregation step was missing:
# Wide -> long: the five score/weight column pairs become the two columns
# `score` and `weight`, with `count` recording which pair a row came from.
long_names <- c("score", "weight")
data1 <- reshape(
  data,
  direction = "long",
  varying = lapply(long_names, paste0, 1:5),
  v.names = long_names,
  timevar = "count",
  times = 1:5,
  idvar = "id"
)
# Weighted count = sum of the weights for each score/group/type combination.
result <- aggregate(weight ~ score + group + type, data = data1, FUN = sum)
result
score group type weight
1 1 1 1 479
2 3 1 1 558
3 4 1 1 454
4 1 2 1 378
5 2 2 1 154
6 3 2 1 174
7 4 2 1 145
8 1 2 2 535
9 2 2 2 855
10 3 2 2 248
11 4 2 2 499
12 1 1 3 189
13 2 1 3 351
14 3 1 3 600
15 4 1 3 362
16 1 2 3 596
17 2 2 3 265
18 3 2 3 193
19 4 2 3 522
result can be reordered by
# Sort by score first, then group, then type.
result[order(result$score, result$group, result$type), ]
score group type weight
1 1 1 1 479
12 1 1 3 189
4 1 2 1 378
8 1 2 2 535
16 1 2 3 596
13 2 1 3 351
5 2 2 1 154
9 2 2 2 855
17 2 2 3 265
2 3 1 1 558
14 3 1 3 600
6 3 2 1 174
10 3 2 2 248
18 3 2 3 193
3 4 1 1 454
15 4 1 3 362
7 4 2 1 145
11 4 2 2 499
19 4 2 3 522
data.table
As shown by akrun, melt() from the data.table package can be combined with dplyr. Alternatively, we can stay with the data.table syntax for aggregation:
library(data.table)
cols <- c("score", "weight") # to save typing
# `cols` serves both as the column-name patterns to stack and as the names of
# the stacked value columns.
long <- melt(setDT(data), measure.vars = patterns(cols), value.name = cols)
# keyby groups and sorts the result by score, group, type in one step.
long[, .(weightedCOUNT = sum(weight)), keyby = .(score, group, type)]
score group type weightedCOUNT
1: 1 1 1 479
2: 1 1 3 189
3: 1 2 1 378
4: 1 2 2 535
5: 1 2 3 596
6: 2 1 3 351
7: 2 2 1 154
8: 2 2 2 855
9: 2 2 3 265
10: 3 1 1 558
11: 3 1 3 600
12: 3 2 1 174
13: 3 2 2 248
14: 3 2 3 193
15: 4 1 1 454
16: 4 1 3 362
17: 4 2 1 145
18: 4 2 2 499
19: 4 2 3 522
The keyby parameter is used for grouping and ordering the output in one step.
Completion of missing combinations of the grouping variables is also possible in data.table syntax using the cross join function CJ():
# Weighted count per score/group/type combination that actually occurs.
totals <- melt(setDT(data), measure.vars = patterns(cols), value.name = cols)[
  , .(weightedCOUNT = sum(weight)), keyby = .(score, group, type)]
# Join onto the cross product of all observed score, group and type values;
# combinations without data come back with an NA count.
completed <- totals[
  CJ(score, group, type, unique = TRUE),
  on = .(score, group, type)
]
# Replace those NA counts by 0 in place and print the result.
completed[is.na(weightedCOUNT), weightedCOUNT := 0][]
score group type weightedCOUNT
1: 1 1 1 479
2: 1 1 2 0
3: 1 1 3 189
4: 1 2 1 378
5: 1 2 2 535
6: 1 2 3 596
7: 2 1 1 0
8: 2 1 2 0
9: 2 1 3 351
10: 2 2 1 154
11: 2 2 2 855
12: 2 2 3 265
13: 3 1 1 558
14: 3 1 2 0
15: 3 1 3 600
16: 3 2 1 174
17: 3 2 2 248
18: 3 2 3 193
19: 4 1 1 454
20: 4 1 2 0
21: 4 1 3 362
22: 4 2 1 145
23: 4 2 2 499
24: 4 2 3 522
score group type weightedCOUNT
I have a question on how to add the value from a group to rest of the elements in the group then delete that row. for ex:
# Two years of 21 rows each: seeds 1, 2, 3 and 99 over days 1-5 in one
# cluster, followed by a single seed-99 row in a cluster of its own. The
# 21-row Seed/Day/value pattern is repeated explicitly for both years.
df <- data.frame(
  Year = rep(c(1, 2), each = 21),
  Cluster = c(rep("a", 20), "c", rep("b", 20), "d"),
  Seed = rep(c(rep(c(1, 2, 3, 99), each = 5), 99), times = 2),
  Day = rep(c(rep(1:5, times = 4), 1), times = 2),
  value = rep(
    c(5, 2, 1, 2, 8, 6, 7, 9, 3, 5, 2, 1, 2, 8, 6, 55, 66, 77, 88, 99, 10),
    times = 2
  )
)
in the above example, my data is grouped by Year, Cluster, Seed and Day where seed=99 values need to be added to above rows based on (Year, Cluster and Day) group then delete this row. for ex: Row # 16, is part of (Year=1, Cluster=a,Day=1 and Seed=99) group and the value of Row #16 which is 55 should be added to Row #1 (5+55), Row # 6 (6+55) and Row # 11 (2+55) and row # 16 should be deleted. But when it comes to Row #21, which is in cluster=C with seed=99, should remain in the database as is as it cannot find any matching in year+cluster+day combination.
My actual data has 1 million records with 10 years, 80 clusters, 500 days and 10+1 seeds (1 to 10 and 99), so I am looking for an efficient solution.
Year Cluster Seed Day value
1 1 a 1 1 60
2 1 a 1 2 68
3 1 a 1 3 78
4 1 a 1 4 90
5 1 a 1 5 107
6 1 a 2 1 61
7 1 a 2 2 73
8 1 a 2 3 86
9 1 a 2 4 91
10 1 a 2 5 104
11 1 a 3 1 57
12 1 a 3 2 67
13 1 a 3 3 79
14 1 a 3 4 96
15 1 a 3 5 105
16 1 c 99 1 10
17 2 b 1 1 60
18 2 b 1 2 68
19 2 b 1 3 78
20 2 b 1 4 90
21 2 b 1 5 107
22 2 b 2 1 61
23 2 b 2 2 73
24 2 b 2 3 86
25 2 b 2 4 91
26 2 b 2 5 104
27 2 b 3 1 57
28 2 b 3 2 67
29 2 b 3 3 79
30 2 b 3 4 96
31 2 b 3 5 105
32 2 d 99 1 10
A data.table approach:
library(data.table)
# Per (Year, Cluster, Day) group: add the seed-99 value to every other seed,
# then drop the seed-99 rows that were absorbed this way.
# - sum() yields 0 when a group has no seed-99 row, so regular rows pass
#   through unchanged; indexing value[Seed == 99] directly would be
#   zero-length there and ifelse() would turn those rows into NA.
# - flag marks groups made up of seed-99 rows only; these have nothing to be
#   added to and are therefore kept as is.
df <- setDT(df)[
  , `:=`(
    value = ifelse(Seed != 99, value + sum(value[Seed == 99]), value),
    flag = all(Seed == 99)
  ),
  by = .(Year, Cluster, Day)
][!(Seed == 99 & !flag)][, flag := NULL]
Output:
df[]
Year Cluster Seed Day value
1: 1 a 1 1 60
2: 1 a 1 2 68
3: 1 a 1 3 78
4: 1 a 1 4 90
5: 1 a 1 5 107
6: 1 a 2 1 61
7: 1 a 2 2 73
8: 1 a 2 3 86
9: 1 a 2 4 91
10: 1 a 2 5 104
11: 1 a 3 1 57
12: 1 a 3 2 67
13: 1 a 3 3 79
14: 1 a 3 4 96
15: 1 a 3 5 105
16: 1 c 99 1 10
17: 2 b 1 1 60
18: 2 b 1 2 68
19: 2 b 1 3 78
20: 2 b 1 4 90
21: 2 b 1 5 107
22: 2 b 2 1 61
23: 2 b 2 2 73
24: 2 b 2 3 86
25: 2 b 2 4 91
26: 2 b 2 5 104
27: 2 b 3 1 57
28: 2 b 3 2 67
29: 2 b 3 3 79
30: 2 b 3 4 96
31: 2 b 3 5 105
32: 2 d 99 1 10
Here's an approach using the tidyverse. If you're looking for speed with a million rows, a data.table solution will probably perform better.
library(tidyverse)
# Example input: 21 rows per year (seeds 1, 2, 3 and 99 over days 1-5 in one
# cluster, then one seed-99 row in a separate cluster), for years 1 and 2.
# The Seed/Day/value pattern of one year is repeated explicitly.
df <- data.frame(
  Year = rep(c(1, 2), each = 21),
  Cluster = c(rep("a", 20), "c", rep("b", 20), "d"),
  Seed = rep(c(rep(c(1, 2, 3, 99), each = 5), 99), times = 2),
  Day = rep(c(rep(1:5, times = 4), 1), times = 2),
  value = rep(
    c(5, 2, 1, 2, 8, 6, 7, 9, 3, 5, 2, 1, 2, 8, 6, 55, 66, 77, 88, 99, 10),
    times = 2
  )
)
# Seed-99 rows hold the amount to add to the other seeds that share their
# Year/Cluster/Day.
seeds <- df %>%
  filter(Seed == 99)
# left_join keeps regular rows even when their Year/Cluster/Day has no
# seed-99 row (an inner join would silently drop them); the missing addend is
# treated as 0 so those rows pass through unchanged.
matches <- df %>%
  filter(Seed != 99) %>%
  left_join(select(seeds, -Seed), by = c("Year", "Cluster", "Day")) %>%
  mutate(value = value.x + coalesce(value.y, 0)) %>%
  select(Year, Cluster, Seed, Day, value)
# Seed-99 rows with no regular row to be added to stay in the result as is.
no_matches <- anti_join(seeds, matches, by = c("Year", "Cluster", "Day"))
bind_rows(matches, no_matches) %>%
  arrange(Year, Cluster, Seed, Day)
#> Year Cluster Seed Day value
#> 1 1 a 1 1 60
#> 2 1 a 1 2 68
#> 3 1 a 1 3 78
#> 4 1 a 1 4 90
#> 5 1 a 1 5 107
#> 6 1 a 2 1 61
#> 7 1 a 2 2 73
#> 8 1 a 2 3 86
#> 9 1 a 2 4 91
#> 10 1 a 2 5 104
#> 11 1 a 3 1 57
#> 12 1 a 3 2 67
#> 13 1 a 3 3 79
#> 14 1 a 3 4 96
#> 15 1 a 3 5 105
#> 16 1 c 99 1 10
#> 17 2 b 1 1 60
#> 18 2 b 1 2 68
#> 19 2 b 1 3 78
#> 20 2 b 1 4 90
#> 21 2 b 1 5 107
#> 22 2 b 2 1 61
#> 23 2 b 2 2 73
#> 24 2 b 2 3 86
#> 25 2 b 2 4 91
#> 26 2 b 2 5 104
#> 27 2 b 3 1 57
#> 28 2 b 3 2 67
#> 29 2 b 3 3 79
#> 30 2 b 3 4 96
#> 31 2 b 3 5 105
#> 32 2 d 99 1 10
Created on 2018-11-23 by the reprex package (v0.2.1)