I have data like below:
V1 V2
1 orange, apple
2 orange, lemon
3 lemon, apple
4 orange, lemon, apple
5 lemon
6 apple
7 orange
8 lemon, apple
I want to split the V2 variable like this:
There are three categories in the V2 column: "orange", "lemon", "apple".
For each of the categories I want to create a new column (variable) that indicates whether that name appeared in V2 (0/1).
I tried this:
df %>% separate(V2, into = c("orange", "lemon", "apple"))
... and I got this result, but it's not what I expected.
V1 orange lemon apple
1 1 orange apple <NA>
2 2 orange lemon <NA>
3 3 lemon apple <NA>
4 4 orange lemon apple
5 5 lemon <NA> <NA>
6 6 apple <NA> <NA>
7 7 orange <NA> <NA>
8 8 lemon apple <NA>
The result I expect is shown below.
V1 orange lemon apple
1 1 0 1
2 1 1 0
3 0 1 1
4 1 1 1
5 0 1 0
6 0 0 1
7 1 0 0
8 0 1 1
You could try pivoting:
library(dplyr)
library(tidyr)
df |>
  separate_rows(V2, sep = ", ") |>
  mutate(ind = 1) |>
  pivot_wider(names_from = V2,
              values_from = ind,
              values_fill = 0)
Output is:
# A tibble: 8 × 4
V1 orange apple lemon
<int> <dbl> <dbl> <dbl>
1 1 1 1 0
2 2 1 0 1
3 3 0 1 1
4 4 1 1 1
5 5 0 0 1
6 6 0 1 0
7 7 1 0 0
8 8 0 1 1
Data I used:
V1 <- 1:8
V2 <- c("orange, apple", "orange, lemon",
"lemon, apple", "orange, lemon, apple",
"lemon", "apple", "orange",
"lemon, apple")
df <- tibble(V1, V2)
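For comparison, a minimal base R sketch of the same indicator construction (assuming the three category names are known in advance, as stated in the question):

categories <- c("orange", "lemon", "apple")
# one 0/1 column per category, flagging whether that fruit appears in V2
ind <- sapply(categories, function(x) as.integer(grepl(x, df$V2)))
cbind(df["V1"], ind)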
We may use dummy_cols from fastDummies:
library(stringr)
library(fastDummies)
library(dplyr)
dummy_cols(df, "V2", split = ",\\s+", remove_selected_columns = TRUE) %>%
  rename_with(~ str_remove(.x, '.*_'))
Output:
# A tibble: 8 × 4
V1 apple lemon orange
<int> <int> <int> <int>
1 1 1 0 1
2 2 0 1 1
3 3 1 1 0
4 4 1 1 1
5 5 0 1 0
6 6 1 0 0
7 7 0 0 1
8 8 1 1 0
I have a time-series panel dataset that is structured in the following way: there are multiple funds that each own multiple stocks, and we have a value column for each stock. As you can see, the panel is not balanced. My actual dataset is very large, with each fund holding at least 500 stocks and different quarters being represented, some of which have missing quarter values.
df <- data.frame(
fund_id = c(1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2),
stock_id = c(1,1,1,1,1,1,2,2,2,2,2,2,2,1,1,3,3,3,3),
year_q = c("2011-03","2011-06","2011-09","2011-12","2012-03","2012-06","2011-12","2012-03","2012-06","2012-09",
"2012-12","2013-03","2013-06","2014-09","2015-03","2013-03","2013-06","2013-09","2013-12"),
value = c(1,2,1,3,4,2,1,2,3,4,2,1,3,1,1,3,2,3,1)
)
> df
fund_id stock_id year_q value
1 1 1 2011-03 1
2 1 1 2011-06 2
3 1 1 2011-09 1
4 1 1 2011-12 3
5 1 1 2012-03 4
6 1 1 2012-06 2
7 1 2 2011-12 1
8 1 2 2012-03 2
9 1 2 2012-06 3
10 1 2 2012-09 4
11 1 2 2012-12 2
12 1 2 2013-03 1
13 1 2 2013-06 3
14 2 1 2014-09 1
15 2 1 2015-03 1
16 2 3 2013-03 3
17 2 3 2013-06 2
18 2 3 2013-09 3
19 2 3 2013-12 1
I would like to calculate, for each fund, the percentage of stocks held in the current quarter that were also held in the previous one to three quarters. So basically, for every fund and every date, I would like to have three columns, past 1Q, past 2Q and past 3Q, which show what percentage of the stocks held on that date were also present in each of those past quarters.
Here is what the result should look like:
result <- data.frame(
fund_id = c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2),
year_q = c("2011-03","2011-06","2011-09","2011-12","2012-03","2012-06","2012-09","2012-12","2013-03","2013-06",
"2013-03","2013-06","2013-09","2013-12","2014-03","2014-06","2014-09","2014-12","2015-03"),
past_1Q = c("NA",1,1,0.5,1,1,0.5,1,1,1,"NA",1,1,1,"NA","NA","NA","NA","NA"),
past_2Q = c("NA","NA",1,0.5,0.5,1,0.5,0.5,1,1,"NA","NA",1,1,"NA","NA","NA","NA","NA"),
past_3Q = c("NA","NA","NA",0.5,0.5,0.5,0.5,0.5,0.5,1,"NA","NA","NA",1,"NA","NA","NA","NA","NA")
)
> result
fund_id year_q past_1Q past_2Q past_3Q
1 1 2011-03 NA NA NA
2 1 2011-06 1 NA NA
3 1 2011-09 1 1 NA
4 1 2011-12 0.5 0.5 0.5
5 1 2012-03 1 0.5 0.5
6 1 2012-06 1 1 0.5
7 1 2012-09 0.5 0.5 0.5
8 1 2012-12 1 0.5 0.5
9 1 2013-03 1 1 0.5
10 1 2013-06 1 1 1
11 2 2013-03 NA NA NA
12 2 2013-06 1 NA NA
13 2 2013-09 1 1 NA
14 2 2013-12 1 1 1
15 2 2014-03 NA NA NA
16 2 2014-06 NA NA NA
17 2 2014-09 NA NA NA
18 2 2014-12 NA NA NA
19 2 2015-03 NA NA NA
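To make the definition concrete, here is a check of one cell of the table above (a quick base R sketch over the df defined earlier; held() is just a throwaway helper):

# stocks fund 1 held in a given quarter
held <- function(q) unique(df$stock_id[df$fund_id == 1 & df$year_q == q])
# fund 1 holds {1, 2} in 2011-12 but held only {1} in 2011-09, so past_1Q = 0.5
mean(held("2011-12") %in% held("2011-09"))
#> [1] 0.5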
I tried to do this using rollapply but can't get the correct results.
I understand that this might not be the best sample data, but in my real data each fund usually has more than 500 stocks, and I expect the percentage of matching stocks from one period to the past periods to be around 0.95 on average.
This is what I have to get the first two result columns (credit to @r2evans):
result <- df %>%
  group_by(fund_id) %>%
  mutate(miny = min(year_q), maxy = max(year_q)) %>%
  distinct(fund_id, miny, maxy) %>%
  group_by(fund_id) %>%
  mutate(across(c(miny, maxy), ~ as.Date(paste0(., "-01")))) %>%
  transmute(year_q = purrr::map2(miny, maxy, ~ format(seq(.x, .y, by = "3 months"), format = "%Y-%m"))) %>%
  tidyr::unnest(year_q) %>%
  full_join(df, by = c("fund_id", "year_q")) %>%
  distinct(fund_id, year_q) %>%
  arrange(fund_id, year_q)
library(tidyverse)
df %>%
  mutate(year_q = as.Date(paste0(year_q, '-01'))) %>%
  group_by(fund_id, year_q) %>%
  summarise(stock_id = list(unique(stock_id))) %>%
  complete(year_q = seq(min(year_q), max(year_q), by = "3 months")) %>%
  reduce(.init = ., 1:3, ~ mutate(.x, "past_{.y}Q" := map(1:n(), \(N) unlist(stock_id[pmax(N-.y, 0)])))) %>%
  mutate(across(contains("past"), \(past) map2_dbl(stock_id, past, ~ mean(.x %in% .y)) %>% replace_na(0))) %>%
  ungroup()
# A tibble: 19 × 6
fund_id year_q stock_id past_1Q past_2Q past_3Q
<dbl> <date> <list> <dbl> <dbl> <dbl>
1 1 2011-03-01 <dbl [1]> 0 0 0
2 1 2011-06-01 <dbl [1]> 1 0 0
3 1 2011-09-01 <dbl [1]> 1 1 0
4 1 2011-12-01 <dbl [2]> 0.5 0.5 0.5
5 1 2012-03-01 <dbl [2]> 1 0.5 0.5
6 1 2012-06-01 <dbl [2]> 1 1 0.5
7 1 2012-09-01 <dbl [1]> 1 1 1
8 1 2012-12-01 <dbl [1]> 1 1 1
9 1 2013-03-01 <dbl [1]> 1 1 1
10 1 2013-06-01 <dbl [1]> 1 1 1
11 2 2013-03-01 <dbl [1]> 0 0 0
12 2 2013-06-01 <dbl [1]> 1 0 0
13 2 2013-09-01 <dbl [1]> 1 1 0
14 2 2013-12-01 <dbl [1]> 1 1 1
15 2 2014-03-01 <NULL> 0 0 0
16 2 2014-06-01 <NULL> 0 0 0
17 2 2014-09-01 <dbl [1]> 0 0 0
18 2 2014-12-01 <NULL> 0 0 0
19 2 2015-03-01 <dbl [1]> 0 1 0
Given your example, I think the code below gets you there. You might need to switch to data.table if you have a lot of records. Note that I use df1, not df (df is a function in R). I used padr::pad to fill in the missing quarters within a fund, so it will only fill in quarters if there is data from at least one stock in the fund. It will not add quarters that occur in fund 2 to fund 1, as these have nothing to do with fund 1.
Edit: added a group by in the lag function to correctly lag over the stock_id, as the arrange otherwise puts the NA values for the stock_ids out of the desired order.
df1 %>%
  mutate(year_q = ymd(paste0(year_q, "-01"))) %>%
  group_by(fund_id) %>%
  padr::pad(interval = "3 months") %>%
  arrange(fund_id, stock_id, year_q) %>%
  mutate(past_1Q = if_else(stock_id == lag(stock_id, order_by = year_q, default = 0), 1, 0),
         past_2Q = if_else(stock_id == lag(stock_id, n = 2, order_by = year_q, default = 0), 1, 0),
         past_3Q = if_else(stock_id == lag(stock_id, n = 3, order_by = year_q, default = 0), 1, 0)) %>%
  group_by(year_q, .add = TRUE) %>%
  # add number of stocks in the fund in this quarter
  mutate(n_stocks = n()) %>%
  summarise(past_1Q = sum(past_1Q, na.rm = T) / mean(n_stocks, na.rm = T),
            past_2Q = sum(past_2Q, na.rm = T) / mean(n_stocks, na.rm = T),
            past_3Q = sum(past_3Q, na.rm = T) / mean(n_stocks, na.rm = T))
# A tibble: 19 × 5
# Groups: fund_id [2]
fund_id year_q past_1Q past_2Q past_3Q
<dbl> <date> <dbl> <dbl> <dbl>
1 1 2011-03-01 0 0 0
2 1 2011-06-01 1 0 0
3 1 2011-09-01 1 1 0
4 1 2011-12-01 0.5 0.5 0.5
5 1 2012-03-01 0 1 0.5
6 1 2012-06-01 0 1 0
7 1 2012-09-01 1 0 1
8 1 2012-12-01 1 1 0
9 1 2013-03-01 1 1 1
10 1 2013-06-01 1 1 1
11 2 2013-03-01 0 0 0
12 2 2013-06-01 1 0 0
13 2 2013-09-01 1 1 0
14 2 2013-12-01 1 1 1
15 2 2014-03-01 0 0 0
16 2 2014-06-01 0 0 0
17 2 2014-09-01 0 0 0
18 2 2014-12-01 0 0 0
19 2 2015-03-01 0 1 0
I think your results table is incorrect. Looking at 2012-12, there is only one stock live in fund 1. Based on this calculation the outcome should be 100%, not 50%, as 100% of the stocks in the fund now were also in the fund last quarter, etc.
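A quick check of that claim against the sample data (a base R sketch on the df from the question):

# stocks fund 1 held in 2012-09 and in 2012-12
with(df, stock_id[fund_id == 1 & year_q == "2012-09"])
#> [1] 2
with(df, stock_id[fund_id == 1 & year_q == "2012-12"])
#> [1] 2
# the single stock held in 2012-12 was also held in 2012-09, i.e. 1/1 = 100%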
I have a dataframe as follows:
ID  Col1    RespID  Col3  Col4  Year  Month  Day
1   blue    729Ad   3.2   A     2021  April  2
2   orange  295gS   6.5   A     2021  April  1
3   red     729Ad   8.4   B     2021  April  20
4   yellow  592Jd   2.9   A     2021  March  12
5   green   937sa   3.5   B     2021  May    13
I would like to calculate a new column, Col5, such that its value is 1 if the row has a Col4 value of A and there exists another row somewhere in the dataset with the same RespID but a Col4 value of B; otherwise its value is 0. Then I will drop all rows with a Col4 value of B, to keep just those with A. I'd also like to account for the date fields (Year, Month, Day) so that this is done in groups based on, say, a 30-day timeframe: only if 'B' appears within 30 days of when 'A' appears in the dataset is there a 1 (if 'B' appears within 60 days, then there is no 1). Additionally, I'd like to keep everything as data.frames.
Here is what the desired output table would look like prior to dropping rows with Col4 value of B:
ID  Col1    RespID  Col3  Col4  Col5
1   blue    729Ad   3.2   A     1
2   orange  295gS   6.5   A     0
3   red     729Ad   8.4   B     0
4   yellow  592Jd   2.9   A     0
5   green   937sa   3.5   B     0
I have found Ronak's solution in this thread (Calculated Column Based on Rows in Tidymodels Recipe) to be useful; however, I would like to modify it for the date range.
A lot of things to unpack here.
I think you're tripping up over your own feet by trying to do too many things at once. I've broken down the code into four distinct steps to make the thought process easy to follow. Obviously, for use in a production environment it should be rewritten more efficiently.
1. Generate some data
library(tidyverse)
set.seed(42)
df <- tibble(
id = c(1:10),
resp_id = c(1701, seq(2286, 2289), 1701, seq(2290, 2293)),
grouping = sample(c("A", "B"), size = 10, replace = TRUE),
date = seq.Date(as.Date("2363-10-04"), as.Date("2363-11-17"), length.out = 10)
)
Resulting data:
# A tibble: 10 × 4
id resp_id grouping date
<int> <dbl> <chr> <date>
1 1 1701 A 2363-10-04
2 2 2286 A 2363-10-08
3 3 2287 A 2363-10-13
4 4 2288 A 2363-10-18
5 5 2289 B 2363-10-23
6 6 1701 B 2363-10-28
7 7 2290 B 2363-11-02
8 8 2291 B 2363-11-07
9 9 2292 A 2363-11-12
10 10 2293 B 2363-11-17
2. Check grouping
df <- df %>%
  mutate(
    is_a = ifelse(grouping == "A", 1, 0),
    is_b = ifelse(grouping == "B", 1, 0)
  )
We have the grouping now as easy-to-use dummy variables:
> df
# A tibble: 10 × 6
id resp_id grouping date is_a is_b
<int> <dbl> <chr> <date> <dbl> <dbl>
1 1 1701 A 2363-10-04 1 0
2 2 2286 A 2363-10-08 1 0
3 3 2287 A 2363-10-13 1 0
4 4 2288 A 2363-10-18 1 0
5 5 2289 B 2363-10-23 0 1
6 6 1701 B 2363-10-28 0 1
7 7 2290 B 2363-11-02 0 1
8 8 2291 B 2363-11-07 0 1
9 9 2292 A 2363-11-12 1 0
10 10 2293 B 2363-11-17 0 1
3. Check completeness
df <- df %>%
  group_by(
    resp_id
  ) %>%
  mutate(
    # Check if the grouping has both "A" and "B" values
    is_complete = ifelse(
      sum(is_a) > 0 & sum(is_b) > 0,
      1,
      0
    )
  ) %>%
  ungroup()
We see that there is only one resp_id value that is complete — 1701:
> df
# A tibble: 10 × 7
id resp_id grouping date is_a is_b is_complete
<int> <dbl> <chr> <date> <dbl> <dbl> <dbl>
1 1 1701 A 2363-10-04 1 0 1
2 2 2286 A 2363-10-08 1 0 0
3 3 2287 A 2363-10-13 1 0 0
4 4 2288 A 2363-10-18 1 0 0
5 5 2289 B 2363-10-23 0 1 0
6 6 1701 B 2363-10-28 0 1 1
7 7 2290 B 2363-11-02 0 1 0
8 8 2291 B 2363-11-07 0 1 0
9 9 2292 A 2363-11-12 1 0 0
10 10 2293 B 2363-11-17 0 1 0
4. Assign target value
df <- df %>%
  group_by(
    resp_id
  ) %>%
  mutate(
    # Check if the "A" part of a complete grouping has another value within 30 days
    is_within_timeframe = ifelse(
      is_complete == 1 & is_a == 1 & max(date) - min(date) <= 30,
      1,
      0
    )
  ) %>%
  ungroup()
We see that our one complete set does in fact have a B value that falls within 30 days of the A observation (caveat: this only works if there are always exactly one or two observations per grouping!). The column is_within_timeframe corresponds to your Col5. A per-row variant that handles more than two observations per grouping is sketched after the output below:
> df
# A tibble: 10 × 8
id resp_id grouping date is_a is_b is_complete is_within_timeframe
<int> <dbl> <chr> <date> <dbl> <dbl> <dbl> <dbl>
1 1 1701 A 2363-10-04 1 0 1 1
2 2 2286 A 2363-10-08 1 0 0 0
3 3 2287 A 2363-10-13 1 0 0 0
4 4 2288 A 2363-10-18 1 0 0 0
5 5 2289 B 2363-10-23 0 1 0 0
6 6 1701 B 2363-10-28 0 1 1 0
7 7 2290 B 2363-11-02 0 1 0 0
8 8 2291 B 2363-11-07 0 1 0 0
9 9 2292 A 2363-11-12 1 0 0 0
10 10 2293 B 2363-11-17 0 1 0 0
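As mentioned above, if a grouping can contain more than two observations, the same idea can be applied per row instead of via the group's date range: for each A row, check whether any B row with the same resp_id falls within 30 days of it. A rough sketch (same column names as above; purrr comes along with tidyverse):

df <- df %>%
  group_by(resp_id) %>%
  mutate(
    # for each "A" row, flag whether any "B" row of the same resp_id
    # lies within 30 days of it
    is_within_timeframe = map_dbl(
      seq_len(n()),
      \(i) as.numeric(grouping[i] == "A" &&
                        any(grouping == "B" &
                              abs(as.numeric(date - date[i])) <= 30))
    )
  ) %>%
  ungroup()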
There are only 2 farms, but tons of fruit. I am trying to see which farm has been performing better over 3 years, where performance is simply farm_i / (farm1 + farm2); so for fruit == peach in y2019, farm 1's performance was 20% vs. farm 2's 80%.
Sample data:
df <- data.frame(fruit = c("apple", "apple", "peach", "peach", "pear", "pear", "lime", "lime"),
farm = as.factor(c(1,2,1,2,1,2,1,2)), 'y2019' = c(0,0,3,12,0,7,4,6),
'y2018' = c(5,3,0,0,8,2,0,0),'y2017' = c(4,5,7,15,0,0,0,0) )
> df
fruit farm y2019 y2018 y2017
1 apple 1 0 5 4
2 apple 2 0 3 5
3 peach 1 3 0 7
4 peach 2 12 0 15
5 pear 1 0 8 0
6 pear 2 7 2 0
7 lime 1 4 0 0
8 lime 2 6 0 0
>
Desired output:
out
fruit farm y2019 y2018 y2017
1 apple 1 0.0 0.625 0.444444
2 apple 2 0.0 0.375 0.555556
3 peach 1 0.2 0.000 0.318182
4 peach 2 0.8 0.000 0.681818
5 pear 1 0.0 0.800 0.000000
6 pear 2 1.0 0.200 0.000000
7 lime 1 0.4 0.000 0.000000
8 lime 2 0.6 0.000 0.000000
>
This is as far as I could go:
df %>%
  group_by(fruit) %>%
  summarise(across(where(is.numeric), sum))
We can group by 'fruit' and mutate across the columns that start with 'y' to divide the elements by the sum of the values in those columns; if all values are 0, then return 0.
library(dplyr)
df %>%
  group_by(fruit) %>%
  mutate(across(starts_with('y'), ~ if(all(. == 0)) 0 else ./sum(.)))
# A tibble: 8 x 5
# Groups: fruit [4]
# fruit farm y2019 y2018 y2017
# <chr> <fct> <dbl> <dbl> <dbl>
#1 apple 1 0 0.625 0.444
#2 apple 2 0 0.375 0.556
#3 peach 1 0.2 0 0.318
#4 peach 2 0.8 0 0.682
#5 pear 1 0 0.8 0
#6 pear 2 1 0.2 0
#7 lime 1 0.4 0 0
#8 lime 2 0.6 0 0
NOTE: Here we just used the dplyr package and it is done in a single step.
Or another option is adorn_percentages from janitor:
library(janitor)
library(purrr)
df %>%
  group_split(fruit) %>%
  map_dfr(adorn_percentages, denominator = "col") %>%
  as_tibble
Or using data.table
library(data.table)
setDT(df)[, (3:5) := lapply(.SD, function(x) if(all(x == 0)) 0
          else x/sum(x, na.rm = TRUE)), .SDcols = 3:5, by = fruit][]
Or using base R
grpSums <- rowsum(df[3:5], df$fruit)  # per-fruit column totals
df[3:5] <- df[3:5]/grpSums[match(df$fruit, row.names(grpSums)),]
df[3:5][is.na(df[3:5])] <- 0  # all-zero groups give 0/0 = NaN; set those to 0
We can use prop.table to calculate the proportions for each fruit.
library(dplyr)
df %>%
  group_by(fruit) %>%
  mutate(across(where(is.numeric), prop.table),
         # to replace `NaN` with 0
         across(where(is.numeric), tidyr::replace_na, 0))
# fruit farm y2019 y2018 y2017
# <chr> <fct> <dbl> <dbl> <dbl>
#1 apple 1 0 0.625 0.444
#2 apple 2 0 0.375 0.556
#3 peach 1 0.2 0 0.318
#4 peach 2 0.8 0 0.682
#5 pear 1 0 0.8 0
#6 pear 2 1 0.2 0
#7 lime 1 0.4 0 0
#8 lime 2 0.6 0 0
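Note that newer dplyr versions (1.1+) deprecate passing extra arguments through across() this way; the replace_na() step would usually be written with a lambda instead, which gives the same result:

df %>%
  group_by(fruit) %>%
  mutate(across(where(is.numeric), prop.table),
         across(where(is.numeric), \(x) tidyr::replace_na(x, 0)))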
data1=data.frame("School"=c(1,1,2,2,3,3,4,4),
"Fund"=c(0,1,0,1,0,1,0,1),
"Total_A_Grade5"=c(22,20,21,24,24,26,25,22),
"Group1_A_Grade5"=c(10,6,6,10,9,9,9,10),
"Group2_A_Grade5"=c(5,9,9,8,10,8,8,6),
"Total_B_Grade5"=c(23,33,19,21,19,23,20,21),
"Group1_B_Grade5"=c(8,7,7,10,9,9,5,5),
"Group2_B_Grade5"=c(6,10,7,6,6,5,9,9),
"Total_A_Grade6"=c(18,24,16,24,26,25,16,19),
"Group1_A_Grade6"=c(7,7,5,9,10,9,5,7),
"Group2_A_Grade6"=c(5,8,6,7,10,8,8,9),
"Total_B_Grade6"=c(26,23,22,24,21,22,24,19),
"Group1_B_Grade6"=c(10,10,6,10,7,8,8,7),
"Group2_B_Grade6"=c(9,6,9,6,7,6,9,9),
"Total_A_Grade7"=c(20,19,18,25,16,21,19,26),
"Group1_A_Grade7"=c(9,7,7,9,7,7,5,8),
"Group2_A_Grade7"=c(8,5,7,9,6,5,5,9),
"Total_B_Grade7"=c(25,21,24,25,18,18,27,18),
"Group1_B_Grade7"=c(10,10,10,7,5,6,8,5),
"Group2_B_Grade7"=c(9,6,8,10,8,6,10,6))
data2=data.frame("School"=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
"Fund"=c(0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1),
"Type"=c('Total','Total','Group1','Group1','Group2','Group2','Total','Total','Group1','Group1','Group2','Group2','Total','Total','Group1','Group1','Group2','Group2','Total','Total','Group1','Group1','Group2','Group2'),
"Class"=c('A','A','A','A','A','A','B','B','B','B','B','B','A','A','A','A','A','A','B','B','B','B','B','B'),
"Grade"=c(5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6),
"Score"=c(22,20,10,6,5,9,23,33,8,7,6,10,18,24,7,7,5,8,26,23,10,10,9,6))
I have 'data1' and want to reshape it to make 'data2', which just shows an example for School 1, Grades 5 and 6, but I want all of data1 reshaped.
The column names of 'data1' contain rich information. For example, Group2_B_Grade6 indicates 'Type' = Group2, 'Class' = B, 'Grade' = 6. I wish to reshape 'data1' and then use these stubs separated by "_" as column names to prepare 'data2'.
data3=data.frame("School"=c(1,1,2,2,3,3,4,4),
"Fund"=c(0,1,0,1,0,1,0,1),
"Grade_5"=c(22,20,21,24,24,26,25,22),
"Grade_6"=c(10,6,6,10,9,9,9,10),
"Grade_7"=c(5,9,9,8,10,8,8,6))
You can do this directly with pivot_longer with some regex in names_pattern.
tidyr::pivot_longer(data1,
                    cols = -c(School, Fund),
                    names_to = c('Type', 'Class', 'Grade'),
                    names_pattern = '(.*?)_([A-Z])_Grade(\\d+)',
                    values_to = 'Score')
# A tibble: 144 x 6
# School Fund Type Class Grade Score
# <dbl> <dbl> <chr> <chr> <chr> <dbl>
# 1 1 0 Total A 5 22
# 2 1 0 Group1 A 5 10
# 3 1 0 Group2 A 5 5
# 4 1 0 Total B 5 23
# 5 1 0 Group1 B 5 8
# 6 1 0 Group2 B 5 6
# 7 1 0 Total A 6 18
# 8 1 0 Group1 A 6 7
# 9 1 0 Group2 A 6 5
#10 1 0 Total B 6 26
# … with 134 more rows
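If Grade should end up numeric (as in data2), names_transform can coerce it during the same call; a small variation on the code above:

tidyr::pivot_longer(data1,
                    cols = -c(School, Fund),
                    names_to = c('Type', 'Class', 'Grade'),
                    names_pattern = '(.*?)_([A-Z])_Grade(\\d+)',
                    names_transform = list(Grade = as.integer),
                    values_to = 'Score')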
Using dplyr (and tidyr):
library(dplyr)
library(tidyr)
data2 <- data1 %>%
  pivot_longer(-c(School, Fund)) %>%
  separate(name, into = c('Type', 'Class', 'Grade')) %>%
  extract(Grade, 'Grade', "([0-9]+)")
data2
#> # A tibble: 144 x 6
#> School Fund Type Class Grade value
#> <dbl> <dbl> <chr> <chr> <chr> <dbl>
#> 1 1 0 Total A 5 22
#> 2 1 0 Group1 A 5 10
#> 3 1 0 Group2 A 5 5
#> 4 1 0 Total B 5 23
#> 5 1 0 Group1 B 5 8
#> 6 1 0 Group2 B 5 6
#> 7 1 0 Total A 6 18
#> 8 1 0 Group1 A 6 7
#> 9 1 0 Group2 A 6 5
#> 10 1 0 Total B 6 26
#> # … with 134 more rows
Created on 2020-04-06 by the reprex package (v0.3.0)
We can use melt from data.table:
library(data.table)
melt(setDT(data1), id.var = c('School', 'Fund'))[,
  c('Type', 'Class', 'Grade') := tstrsplit(variable, "_")][,
  Grade := sub('Grade', '', Grade)][, variable := NULL][]
# School Fund value Type Class Grade
# 1: 1 0 22 Total A 5
# 2: 1 1 20 Total A 5
# 3: 2 0 21 Total A 5
# 4: 2 1 24 Total A 5
# 5: 3 0 24 Total A 5
# ---
#140: 2 1 10 Group2 B 7
#141: 3 0 8 Group2 B 7
#142: 3 1 6 Group2 B 7
#143: 4 0 10 Group2 B 7
#144: 4 1 6 Group2 B 7