Make connections between two datasets - r

I would like to make a connection between the x and df2 datasets. Notice that the dataset x, I have a percentage value, which in this case for the day 03-01-2021 is 0.1 and for the days 01-02-2021 and 01-01-2022 it is 0.45. So from that information, I know the percentage value for 03-01-2021 is 0.1, so this value falls into category I of my dataset df2 (since the values range from 0.1 to 0.2). As for the days 02-01-2021 and 01-01-2022, they correspond to category F of the df2,since the values range from 0.4 to 0.5. So, I would like to generate an output table as follows:
library(dplyr)
df1<- structure(
list(date2= c("01-01-2022","01-01-2022","03-01-2021","03-01-2021","01-02-2021","01-02-2021"),
Category= c("ABC","CDE","ABC","CDE","ABC","CDE"),
coef= c(5,4,0,2,4,5)),
class = "data.frame", row.names = c(NA, -6L))
x<-df1 %>%
group_by(date2) %>%
summarize(across("coef", sum),.groups = 'drop')%>%
arrange(date2 = as.Date(date2, format = "%d-%m-%Y"))
number<-20
x$Percentage<-x$coef/number
date2 coef Percentage
<chr> <dbl> <dbl>
1 03-01-2021 2 0.1
2 01-02-2021 9 0.45
3 01-01-2022 9 0.45
df2 <- structure(
list(
Category = c("A", "B", "C", "D",
"E", "F", "G", "H", "I", "J"),
From = c(0.9,
0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0),
Until = c(
1,
0.8999,
0.7999,
0.6999,
0.5999,
0.4999,
0.3999,
0.2999,
0.1999,
0.0999
),
`1 Val` = c(
2222,
2017.8,
1793.6,
1621.5,
1522.4,
1457.3,
1325.2,
1229.15,
1223.1,
1177.05
),
`2 Val` = c(3200, 2220, 2560,
2200, 2220, 2080, 1220, 1240, 1720, 1620),
`3 Val` = c(
4665,
4122.5,
3732,
3498.75,
3265.5,
3032.25,
2799,
2682.375,
2565.75,
2449.125
),
`4 Val` = c(
6112,
5222.8,
4889.6,
4224,
4278.4,
3972.8,
3667.2,
3224.4,
3361.6,
3222.8
)
),
row.names = c(NA,-10L),
class = c("tbl_df",
"tbl", "data.frame")
)
Category From Until 1 Val 2 Val 3 Val 4 Val
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 0.9 1 2222 3200 4665 6112
2 B 0.8 0.900 2018 2220 4122 5223
3 C 0.7 0.800 1794 2560 3732 4890
4 D 0.6 0.700 1622 2200 3499 4224
5 E 0.5 0.600 1522 2220 3266 4278
6 F 0.4 0.500 1457 2080 3032 3973
7 G 0.3 0.400 1325 1220 2799 3667
8 H 0.2 0.300 1229 1240 2682 3224
9 I 0.1 0.200 1223 1720 2566 3362
10 J 0 0.0999 1177 1620 2449 3223

Using tidyverse, we do a rowwise on the 'x' dataset, slice the rows of 'df2' where the 'Percentage' falls between the 'From' and 'Until', and unpack the data.frame/tibble column
library(dplyr)
library(tidyr)
x %>%
rowwise %>%
mutate(out = df2 %>%
slice(which(Percentage>= From &
Percentage <= Until)[1]) %>%
select(-(1:3)) ) %>%
ungroup %>%
unpack(out)
-output
# A tibble: 3 × 7
date2 coef Percentage `1 Val` `2 Val` `3 Val` `4 Val`
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 03-01-2021 2 0.1 1223. 1720 2566. 3362.
2 01-02-2021 9 0.45 1457. 2080 3032. 3973.
3 01-01-2022 9 0.45 1457. 2080 3032. 3973.
Or this could be done with a non-equi join
library(data.table)
nm1 <- names(df2)[endsWith(names(df2), 'Val')]
setDT(x)[setDT(df2), (nm1) := mget(nm1),
on = .(Percentage >= From, Percentage <= Until)]
-output
> x
date2 coef Percentage 1 Val 2 Val 3 Val 4 Val
1: 03-01-2021 2 0.10 1223.1 1720 2565.75 3361.6
2: 01-02-2021 9 0.45 1457.3 2080 3032.25 3972.8
3: 01-01-2022 9 0.45 1457.3 2080 3032.25 3972.8

Related

Bring excel-table in tidy format

I have some struggles converting the following data (from an Excel-sheet) into a tidy format:
input <- structure(list(...11 = c(
NA, NA, "<1000", ">=1000 and <2000",
"2000", ">2000 and < 3000", ">=3000"
), ...13 = c(
"male", "female",
NA, NA, NA, NA, NA
), ...14 = c(
"<777", "<555", "0.3", "0.1",
"0.15", "0.13", "0.15"
), ...15 = c(
"888-999", "555-999", "0.23",
"0.21", "0", "0.21", "0.36"
), ...16 = c(
"556-899", "1020-1170",
"0.13", "0.29", "0.7", "0.8", "0.2"
), ...17 = c(
">960", ">11000",
"0.58", "0.31", "0.22", "0.65", "0.7"
)), row.names = c(NA, -7L), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 7 × 6
...11 ...13 ...14 ...15 ...16 ...17
<chr> <chr> <chr> <chr> <chr> <chr>
1 NA male <777 888-999 556-899 >960
2 NA female <555 555-999 1020-1170 >11000
3 <1000 NA 0.3 0.23 0.13 0.58
4 >=1000 and <2000 NA 0.1 0.21 0.29 0.31
5 2000 NA 0.15 0 0.7 0.22
6 >2000 and < 3000 NA 0.13 0.21 0.8 0.65
7 >=3000 NA 0.15 0.36 0.2 0.7
I would like to bring it into the following structure:
output <- tibble::tribble(
~gender, ~x, ~y, ~share,
"male", "<777", "<1000", 0.3,
"female", "<555", "<1000", 0.3,
"male", "<777", ">=1000 and <2000", 0.1,
"female", "<555", ">=1000 and <2000", 0.1,
)
# A tibble: 4 × 4
gender x y share
<chr> <chr> <chr> <dbl>
1 male <777 <1000 0.3
2 female <555 <1000 0.3
3 male <777 >=1000 and <2000 0.1
4 female <555 >=1000 and <2000 0.1
Any hints are much appreciated!
As outlined in the comments, here's a suggested approach:
Import the excel sheet twice using readxl's read_excel using the skip argument:
library(readxl)
df1 <- read_excel(file = "yourfile.xlsx", skip = 2)
df2 <- read_excel(file = "yourfile.xlsx", skip = 1)
That should give you (note X1 might be called ...1):
df1 <- read_table("NA male <777 888-999 556-899 >960
<1000 NA 0.3 0.23 0.13 0.58
>=1000and<2000 NA 0.1 0.21 0.29 0.31
2000 NA 0.15 0 0.7 0.22
>2000and<3000 NA 0.13 0.21 0.8 0.65
>=3000 NA 0.15 0.36 0.2 0.7")
df2 <- read_table("NA female <555 555-999 1020-1170 >11000
<1000 NA 0.3 0.23 0.13 0.58
>=1000and<2000 NA 0.1 0.21 0.29 0.31
2000 NA 0.15 0 0.7 0.22
>2000and<3000 NA 0.13 0.21 0.8 0.65
>=3000 NA 0.15 0.36 0.2 0.7")
Then do a little wrangling; most importantly turn into a long format:
library(dplyr)
library(tidyr)
df1 <- df1 |>
select(-male) |>
rename(y = X1) |>
mutate(gender = "male") |>
pivot_longer(-c("gender", "y"), names_to = "x", values_to = "share")
df2 <- df2 |>
select(-female) |>
rename(y = X1) |>
mutate(gender = "female") |>
pivot_longer(-c("gender", "y"), names_to = "x", values_to = "share")
And voila, a tidy frame:
bind_rows(df1, df2) |> arrange(y)
Output:
# A tibble: 40 × 4
y gender x share
<chr> <chr> <chr> <dbl>
1 <1000 male <777 0.3
2 <1000 male 888-999 0.23
3 <1000 male 556-899 0.13
4 <1000 male >960 0.58
5 <1000 female <555 0.3
6 <1000 female 555-999 0.23
7 <1000 female 1020-1170 0.13
8 <1000 female >11000 0.58
9 >=1000and<2000 male <777 0.1
10 >=1000and<2000 male 888-999 0.21
# … with 30 more rows
It's a bit unclear, but I think you'd need to do something like this
df <- input[3:nrow(input),]
input <- input[1:2, 2:3]
t <- input[rep(1:nrow(input), nrow(df)),]
s <- df[rep(1:nrow(df), 2), ]
t <- cbind(t,s)
, and repeat as needed if you need this for multiple columns.

how to calculate the percentage between two rows based on a group in R

I have a datafrema with land use data of 6 points between the years 2005 to 2018. I would like to calculate the percentage change between 2005 to 2018.
df<-structure(list(place = c("F01", "F01", "F02", "F02", "F03", "F03",
"F04", "F04", "F05", "F05", "F06", "F06"), year = c(2005, 2018, 2005,
2018, 2005, 2018, 2005, 2018, 2005, 2018, 2005, 2018), Veg =
c(12281.5824712026, 12292.2267477317, 7254.98919713131,
7488.9138055415, 864.182200710528, 941.602680778032, 549.510775817472, 584.104674537216, 5577.10195081334, 5688.28474549675, 1244.96456185886, 1306.41862713264), Agri = c(113.178596532624, 1376.68748390712, 85.2373706436, 1048.71071335262, 0, 46.236076173504, 0, 46.236076173504, 85.2373706436, 1002.47463717912, 1.413692976528,
228.851945376768 ), Past = c(9190.16856517738, 7855.55923692456, 5029.33750161394, 3776.9718412309, 983.015569149264, 800.981808818688, 710.255983089744, 572.213021852304, 3726.66100294858, 2700.40306039963, 879.982298683488, 597.410020198656), Urb = c(146.026168634304, 200.910719487744, 146.026168634304,
200.910719487744, 141.119822421648, 194.840155529712, 141.119822421648, 194.840155529712, 4.906346212656, 6.070563958032, NA, NA), SoloExp = c(61.12143163224, 67.940421283728, 61.12143163224,
62.451966198384, 50.144521461552, 54.801392443056, 49.146620536944, 52.639273773072, 9.895850835696, 7.650573755328, 6.320039189184, 1.164217745376), Hidro = c(9.230583552624, 7.983207396864, 9.230583552624, 7.983207396864, NA, NA, NA, NA, 7.401098524176, 6.320039189184, 5.654771906112, 4.490554160736), total = c(691953.981181971, 691953.981181971, 691953.981181971,
691953.981181971, 691953.981181971, 691953.981181971, 691953.981181971, 691953.981181971, 691953.981181971, 691953.981181971, 691953.981181971, 691953.981181971)), row.names = c(NA, -12L), class = "data.frame")
I tried using the lead command to calculate the difference between 2005 and 2018, but I was not successful:
df2<-df%>%
select(-c(total))%>%
replace(is.na(.), 0)%>%
pivot_longer(cols = c(3:8),
names_to = 'classe',
values_to = 'area')%>%
group_by(place, classe)%>%
mutate(percent=(((((area)-lead(area))/area)*100)*-1))%>%
pivot_wider(names_from = 'classe',
values_from = 'percent')%>%
select(-c(area, year))
For example for the Veg class I expected to get:
place Veg
F01 0.09
F02 3.22
F03 8.96
F04 6.30
F05 1.99
F06 4.94
Here is one solution if you need percentages for all the parameters
library(dplyr)
library(tidyr)
df_new<-df %>%
select(-(total))%>%
replace(is.na(.), 0)%>%
pivot_longer(cols = c(3:8),
names_to = 'classe',
values_to = 'area') %>%
pivot_wider(names_from=year, values_from=area) %>%
mutate(percent=(`2018`-`2005`)/`2005`) %>%
select(-`2018`,-`2005`) %>%
pivot_wider( names_from="classe", values_from="percent")
df_new
#> # A tibble: 6 × 7
#> place Veg Agri Past Urb SoloExp Hidro
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 F01 0.000867 11.2 -0.145 0.376 0.112 -0.135
#> 2 F02 0.0322 11.3 -0.249 0.376 0.0218 -0.135
#> 3 F03 0.0896 Inf -0.185 0.381 0.0929 NaN
#> 4 F04 0.0630 Inf -0.194 0.381 0.0711 NaN
#> 5 F05 0.0199 10.8 -0.275 0.237 -0.227 -0.146
#> 6 F06 0.0494 161. -0.321 NaN -0.816 -0.206
Created on 2022-01-09 by the reprex package (v2.0.1)
Another possible solution:
library(tidyverse)
df %>%
group_by(place) %>%
summarise(across(-year, ~ 100*(last(.x) / first(.x) - 1)))
#> # A tibble: 6 × 8
#> place Veg Agri Past Urb SoloExp Hidro total
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 F01 0.0867 1116. -14.5 37.6 11.2 -13.5 0
#> 2 F02 3.22 1130. -24.9 37.6 2.18 -13.5 0
#> 3 F03 8.96 Inf -18.5 38.1 9.29 NA 0
#> 4 F04 6.30 Inf -19.4 38.1 7.11 NA 0
#> 5 F05 1.99 1076. -27.5 23.7 -22.7 -14.6 0
#> 6 F06 4.94 16088. -32.1 NA -81.6 -20.6 0

calculate time difference in the same group

I want to convert the column time to be in time decimal format and then find the time interval within each group of the user_id. I have tried the answer below, but I could not get it to work:
Days difference between two dates in same column in R
structure(list(question_id = c(5502L, 5502L, 5502L, 5502L, 5502L
), user_id = c(112197L, 112197L, 112197L, 114033L, 114033L),
time = structure(c(1603720173, 1603720388, 1603720702, 1603603115,
1603949442), class = c("POSIXct", "POSIXt"), tzone = ""),
prediction = c(0.9, 0.95, 0.9, 0.99, 0.94), log_score = c(0.84799690655495,
0.925999418556223, 0.84799690655495, 0.985500430304885, 0.910732661902913
)), row.names = 156182:156186, class = "data.frame")
Perhaps this is what you're looking for?
library(dplyr)
user_data %>%
group_by(user_id) %>%
summarise(day.interval = difftime(max(time), min(time),units = "days"))
# A tibble: 2 x 2
user_id day.interval
<int> <drtn>
1 112197 0.006122685 days
2 114033 4.008414352 days
library(tidyverse)
library(lubridate)
df <- tibble::tribble(
~question_id, ~user_id, ~time, ~prediction, ~log_score,
5502L, 112197L, "2020-10-26 14:49:33", 0.9, 0.84799690655495,
5502L, 112197L, "2020-10-26 14:53:08", 0.95, 0.925999418556223,
5502L, 112197L, "2020-10-26 14:58:22", 0.9, 0.84799690655495,
5502L, 114033L, "2020-10-25 06:18:35", 0.99, 0.985500430304885,
5502L, 114033L, "2020-10-29 06:30:42", 0.94, 0.910732661902913
)
df %>%
as_tibble() %>%
mutate(time = lubridate::ymd_hms(time)) %>%
group_by(user_id) %>%
mutate(diff = time - lag(time),
diff2 = hms::hms(seconds_to_period(diff)))
#> # A tibble: 5 x 7
#> # Groups: user_id [2]
#> question_id user_id time prediction log_score diff diff2
#> <int> <int> <dttm> <dbl> <dbl> <drtn> <time>
#> 1 5502 112197 2020-10-26 14:49:33 0.9 0.848 NA secs NA
#> 2 5502 112197 2020-10-26 14:53:08 0.95 0.926 215 secs 00:03:35
#> 3 5502 112197 2020-10-26 14:58:22 0.9 0.848 314 secs 00:05:14
#> 4 5502 114033 2020-10-25 06:18:35 0.99 0.986 NA secs NA
#> 5 5502 114033 2020-10-29 06:30:42 0.94 0.911 346327 secs 96:12:07

Compute month on month difference in weights

I am working on some portfolio data and I'm stumped by this data manipulation. I have this sample data
df <- tibble(
date = as.Date(c("2020-01-31", "2020-01-31", "2020-01-31",
"2020-02-29", "2020-02-29", "2020-02-29",
"2020-03-31", "2020-03-31", "2020-03-31") ),
id = c("KO", "AAPL", "MSFT",
"KO", "AAPL", "GOOG",
"KO", "AAPL", "MSFT"),
weight = c(0.3, 0.4, 0.3,
0.5, 0.3, 0.2,
0.6, 0.2, 0.2),
`weight_change (desired column)` = c(NA, NA, NA,
0.2, -0.1, 0.2,
0.1, -0.1, 0.2)
)
These are the positions in a sample portfolio. The portfolio gets new weights every month. What I want to calculate is the change in weight for each item in terms of the previous months weight. In this example we see that at the end of February, KO's current weight is 0.5 which is up 0.2 from the previous month. AAPL is down 0.1, while GOOG replaces MSFT so the change with the previous month is its entire current weight: 0.2. How can I set up a mutate such that it looks for the stock in the previous date and calculates the difference between the weights?
If the data is monthly for each 'id', we can do a complete to take account of the missing months, then do a group by diff
library(dplyr)
library(tidyr)
library(zoo)
df %>%
mutate(yearmonth = as.Date(as.yearmon(date))) %>%
group_by(id) %>%
complete(yearmonth = seq(first(yearmonth), last(yearmonth), by = '1 month')) %>%
mutate(weight_change = if(n() == 1) weight else c(NA, diff(replace_na(weight, 0)))) %>%
ungroup %>%
select(names(df), weight_change) %>%
filter(!is.na(date))
# A tibble: 9 x 5
# date id weight `weight_change (desired column)` weight_change
# <date> <chr> <dbl> <dbl> <dbl>
#1 2020-01-31 AAPL 0.4 NA NA
#2 2020-02-29 AAPL 0.3 -0.1 -0.1
#3 2020-03-31 AAPL 0.2 -0.1 -0.100
#4 2020-02-29 GOOG 0.2 0.2 0.2
#5 2020-01-31 KO 0.3 NA NA
#6 2020-02-29 KO 0.5 0.2 0.2
#7 2020-03-31 KO 0.6 0.1 0.100
#8 2020-01-31 MSFT 0.3 NA NA
#9 2020-03-31 MSFT 0.2 0.2 0.2
Here is my not so compact solution. I just use some helper columns, which I leave in so that one can follow.
library(tidyverse)
library(lubridate)
df <- tibble(
date = c("2020-01-31", "2020-01-31", "2020-01-31",
"2020-02-29", "2020-02-29", "2020-02-29",
"2020-03-31", "2020-03-31", "2020-03-31"),
id = c("KO", "AAPL", "MSFT", "KO", "AAPL", "GOOG", "KO", "AAPL", "MSFT"),
weight = c(0.3, 0.4, 0.3, 0.5, 0.3, 0.2, 0.6, 0.2, 0.2),
`weight_change (desired_column)` = c(NA, NA, NA, 0.2, -0.1, 0.2, 0.1, -0.1, 0.2)
) %>% #new code starts here
mutate(
date = as_date(date),
date_ym = floor_date(date,
unit = "month"))%>%
group_by(id)%>%
arrange(date)%>%
mutate(id_n = row_number(),
prev_exist = case_when(lag(date_ym) == date_ym - months(1) ~ "immediate month", #if there is an immediate month
id_n == 1 & date != min(df$date)~ "new month", #if this is a new month
TRUE ~ "no immediate month"),
weight_change = case_when(prev_exist == "new month"~ weight,
prev_exist == "no immediate month" & id_n > 1~ weight,
TRUE ~ weight-lag(weight)),
date_ym = NULL,
id_n = NULL,
prev_exist = NULL)
A timetk approach:
library(timetk)
df %>%
mutate(Month = lubridate::floor_date(date, "month")) %>%
group_by(id) %>%
timetk::pad_by_time(.date_var = Month, .by="month") %>%
select(-Month) %>%
mutate(WC = if(n() == 1) weight else c(NA, diff(weight)))
A tibble: 10 x 5
Groups: id [4]
id date weight weight_change WC
<chr> <date> <dbl> <dbl> <dbl>
1 KO 2020-01-31 0.3 NA NA
2 KO 2020-02-29 0.5 0.2 0.2
3 KO 2020-03-31 0.6 0.1 0.100
4 AAPL 2020-01-31 0.4 NA NA
5 AAPL 2020-02-29 0.3 -0.1 -0.1
6 AAPL 2020-03-31 0.2 -0.1 -0.100
7 MSFT 2020-01-31 0.3 NA NA
8 MSFT NA NA NA NA
9 MSFT 2020-03-31 0.2 0.2 NA
10 GOOG 2020-02-29 0.2 0.2 0.2

Mutate top n rows without throwing away the other rows

I have the following data.frame below. I would like to create a new column w (for weight). w should equal 1 / n for the industries that have the n highest returns for each given date and should equal 0 for the rest of the industries. I can group_by(date) and use top_n(3, wt = return) to filter the top industries and then mutate(w = 1/n), but how can I mutate without throwing away the other industries where w = 0?
structure(list(date = structure(c(16556, 16556, 16556, 16556,
16556, 16556, 16556, 16556, 16556, 16556, 16587, 16587, 16587,
16587, 16587, 16587, 16587, 16587, 16587, 16587, 16617, 16617,
16617, 16617, 16617, 16617, 16617, 16617, 16617, 16617), class = "Date"),
industry = c("Hlth", "Txtls", "BusEq", "Fin", "ElcEq", "Food",
"Beer", "Books", "Cnstr", "Carry", "Clths", "Txtls", "Fin",
"Games", "Cnstr", "Meals", "Hlth", "Hshld", "Telcm", "Rtail",
"Smoke", "Games", "Clths", "Rtail", "Servs", "Meals", "Food",
"Hlth", "Beer", "Trans"), return = c(4.89, 4.37, 4.02, 2.99,
2.91, 2.03, 2, 1.95, 1.86, 1.75, 4.17, 4.09, 1.33, 1.26,
0.42, 0.29, 0.08, -0.11, -0.45, -0.48, 9.59, 6, 5.97, 5.78,
5.3, 4.15, 4.04, 3.67, 3.51, 3.27)), row.names = c(NA, -30L
), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 30 x 3
date industry return
<date> <chr> <dbl>
1 2015-05-01 Hlth 4.89
2 2015-05-01 Txtls 4.37
3 2015-05-01 BusEq 4.02
4 2015-05-01 Fin 2.99
5 2015-05-01 ElcEq 2.91
6 2015-05-01 Food 2.03
7 2015-05-01 Beer 2
8 2015-05-01 Books 1.95
9 2015-05-01 Cnstr 1.86
10 2015-05-01 Carry 1.75
# ... with 20 more rows
EDIT: How would you handle ties? Suppose there is a tie for third place. The third place weight should be split between 3rd and 4th place (assuming only 2 are tied) with weights of (1/n)/2. The 1st and 2nd place weights stay at 1/n.
EDIT: Suppose n = 3. The top 3 A2 values for each A1 should get a weight w of 1/3 if there are no ties. If there is a tie for 3rd place (T3), then we have (1st, 2nd, T3, T3) and I would like weights to be 1/3, 1/3, 1/6, 1/6 to maintain a total weight of 1. This is only for 3rd place however. (1st, T2, T2) should have weights of 1/3, 1/3, 1/3. (T1, T1, T2, T2) should have weights of 1/3, 1/3, 1/6, 1/6, etc.
structure(list(A1 = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("A", "B"), class = "factor"), A2 = c(1, 3, 3,
4, 5, 6, 7, 8, 8)), row.names = c(NA, -9L), class = "data.frame")
The output for df should be:
> df
A1 A2 w
1 A 1 0
2 A 3 0.1666
3 A 3 0.1666
4 A 4 0.3333
5 A 5 0.3333
6 B 6 0
7 B 7 0.3333
8 B 8 0.3333
9 B 8 0.3333
We could create a condition with ifelse. After grouping by 'date', arrange the dataset based on the 'date', and 'return' in descending order, then create the 'w' by creating the condition that if the row_number() is less than 'n', then divide 'return' by 'n' or else return 0
n <- 3
df1 %>%
group_by(date) %>%
arrange(date, -return) %>%
mutate(w = ifelse(row_number() <= n, return/n, 0))
If we are using top_n, then create the column 'w' in the filtered dataset and join with the original
df1 %>%
group_by(date) %>%
top_n(return, n = 3) %>%
mutate(w = return/n()) %>%
right_join(df1) %>%
mutate(w = replace_na(w, 0))
We can group by date then sort the return variable get the last 3 enteries (top 3) and return return/n or else 0.
library(dplyr)
n <- 3
df %>%
group_by(date) %>%
mutate(w = ifelse(return %in% tail(sort(return), n), return/n, 0))
# date industry return w
# <date> <chr> <dbl> <dbl>
# 1 2015-05-01 Hlth 4.89 1.63
# 2 2015-05-01 Txtls 4.37 1.46
# 3 2015-05-01 BusEq 4.02 1.34
# 4 2015-05-01 Fin 2.99 0
# 5 2015-05-01 ElcEq 2.91 0
# 6 2015-05-01 Food 2.03 0
# 7 2015-05-01 Beer 2 0
#....
The base R equivalent of the same logic using ave
ave(df$return, df$date, FUN = function(x) ifelse(x %in% tail(sort(x), n), x/n, 0))
EDIT
As mentioned in comments, in case of ties OP wants to return (1/n)/2 or divide by number of ties we have.
For this I have created a new easier dataframe which makes it easy to understand what is going on.
df <- data.frame(A1 = rep(c("A", "B"),c(5, 4)), A2 = 1:9)
df$A2[2] <- 3
If we use the current code it gives
df %>%
group_by(A1) %>%
mutate(w = ifelse(A2 %in% tail(sort(A2), n), A2/n, 0))
# A tibble: 9 x 3
# Groups: A1 [2]
# A1 A2 w
# <fct> <int> <dbl>
#1 A 1 0
#2 A 3 1
#3 A 3 1
#4 A 4 1.33
#5 A 5 1.67
#6 B 6 0
#7 B 7 2.33
#8 B 8 2.67
#9 B 9 3
which is not what we want. To avoid that, we can group by A2 again and for only those rows where w!=0 we divide it by number of occurrences of A2.
df %>%
group_by(A1) %>%
mutate(w = ifelse(A2 %in% tail(sort(A2), n), A2/n, 0)) %>%
group_by(A2) %>%
mutate(w1 = ifelse(w != 0, w/n(), w)) %>%
ungroup()
# A1 A2 w w1
# <fct> <dbl> <dbl> <dbl>
#1 A 1 0 0
#2 A 3 1 0.5
#3 A 3 1 0.5
#4 A 4 1.33 1.33
#5 A 5 1.67 1.67
#6 B 6 0 0
#7 B 7 2.33 2.33
#8 B 8 2.67 2.67
#9 B 9 3 3
Another EDIT
Turns out we just want to divide w only for the last group present. Moreover, the sum of all the w in each group should sum up to 1. For the updated dataset we can do
n <- 3
temp_df <- df %>%
group_by(A1) %>%
top_n(n, A2)
temp_df %>%
arrange(A1, A2) %>%
mutate(w = ifelse(A2 == A2[1],
(1 - (1/n * sum(A2 != A2[1])))/sum(A2 == A2[1]), 1/n)) %>%
bind_rows(anti_join(df, temp_df) %>%
mutate(w = 0)
) %>%
arrange(A1, A2)
# A1 A2 w
# <fct> <dbl> <dbl>
#1 A 1 0
#2 A 3 0.167
#3 A 3 0.167
#4 A 4 0.333
#5 A 5 0.333
#6 B 6 0
#7 B 7 0.333
#8 B 8 0.333
#9 B 8 0.333
Let's try another variation where we keep all the values of the group same.
df1 = df
df1$A2[6:9] <- 10
temp_df <- df1 %>%
group_by(A1) %>%
top_n(n, A2)
temp_df %>%
arrange(A1, A2) %>%
mutate(w = ifelse(A2 == A2[1],
(1 - (1/n * sum(A2 != A2[1])))/sum(A2 == A2[1]), 1/n)) %>%
bind_rows(anti_join(df1, temp_df) %>%
mutate(w = 0)
) %>%
arrange(A1, A2)
# A1 A2 w
# <fct> <dbl> <dbl>
#1 A 1 0
#2 A 3 0.167
#3 A 3 0.167
#4 A 4 0.333
#5 A 5 0.333
#6 B 10 0.25
#7 B 10 0.25
#8 B 10 0.25
#9 B 10 0.25
The logic is we select the top 3 A2 values along with their groups using top_n. Using anti_join we get all the rows which are not in top 3 and assign a fixed weight w to them as 0. For the rows which are included in top 3 we get the last group rows and assign them the weight which is remaining after assigning the weights to non-last groups.

Resources