Looking at similar questions, I could not find one that matched my need.
If one does contain a solution, please share its link.
I have this dput-produced data:
structure(list(Player = c("Seth Lugo", "Jacob deGrom", "Rick Porcello",
"David Peterson", "Michael Wacha", "Seth Lugo", "Jacob deGrom",
"Rick Porcello", "David Peterson", "Steven Matz", "Seth Lugo",
"Jacob deGrom", "Rick Porcello", "David Peterson", "Seth Lugo",
"Jacob deGrom", "Rick Porcello", "Michael Wacha", "David Peterson",
"Jacob deGrom", "Seth Lugo", "Rick Porcello", "Robert Gsellman",
"Michael Wacha", "Ariel Jurado", "Jacob deGrom", "Rick Porcello",
"Seth Lugo", "Robert Gsellman", "David Peterson"), Date = structure(c(1601164800,
1601078400, 1601078400, 1600905600, 1600819200, 1600732800, 1600646400,
1600560000, 1600473600, 1600387200, 1600300800, 1600214400, 1600128000,
1599955200, 1599868800, 1599782400, 1599609600, 1599523200, 1599436800,
1599350400, 1599264000, 1599177600, 1599091200, 1599004800, 1598918400,
1598832000, 1598745600, 1598745600, 1598659200, 1598572800), tzone = "UTC", class = c("POSIXct",
"POSIXt")), DblHdr = c(0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 2), DateStr = c("09/27/2020",
"09/26/2020", "09/26/2020", "09/24/2020", "09/23/2020", "09/22/2020",
"09/21/2020", "09/20/2020", "09/19/2020", "09/18/2020", "09/17/2020",
"09/16/2020", "09/15/2020", "09/13/2020", "09/12/2020", "09/11/2020",
"09/09/2020", "09/08/2020", "09/07/2020", "09/06/2020", "09/05/2020",
"09/04/2020", "09/03/2020", "09/02/2020", "09/01/2020", "08/31/2020",
"08/30/2020", "08/30/2020", "08/29/2020", "08/28/2020"), Month = c("09",
"09", "09", "09", "09", "09", "09", "09", "09", "09", "09", "09",
"09", "09", "09", "09", "09", "09", "09", "09", "09", "09", "09",
"09", "09", "08", "08", "08", "08", "08"), Tm = c("NYM", "NYM",
"NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM",
"NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM",
"NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM",
"NYM"), Opp = c("WSN", "WSN", "WSN", "WSN", "TBR", "TBR", "TBR",
"ATL", "ATL", "ATL", "PHI", "PHI", "PHI", "TOR", "TOR", "TOR",
"BAL", "BAL", "PHI", "PHI", "PHI", "PHI", "NYY", "BAL", "BAL",
"MIA", "NYY", "NYY", "NYY", "NYY"), Rslt = c("L 5-15", "L 3-4",
"L 3-5", "W 3-2", "L 5-8", "W 5-2", "L 1-2", "L 0-7", "W 7-2",
"L 2-15", "W 10-6", "W 5-4", "L 1-4", "L 3-7", "L 2-3", "W 18-1",
"W 7-6", "L 2-11", "L 8-9", "W 14-1", "W 5-1", "L 3-5", "W 9-7",
"W 9-4", "L 5-9", "L 3-5", "L 7-8", "L 2-5", "L 1-2", "W 4-3"
), W_L = c("L", "L", "L", "W", "L", "W", "L", "L", "W", "L",
"W", "W", "L", "L", "L", "W", "W", "L", "L", "W", "W", "L", "W",
"W", "L", "L", "L", "L", "L", "W"), temp = c("L 5", "L 3", "L 3",
"W 3", "L 5", "W 5", "L 1", "L 0", "W 7", "L 2", "W 10", "W 5",
"L 1", "L 3", "L 2", "W 18", "W 7", "L 2", "L 8", "W 14", "W 5",
"L 3", "W 9", "W 9", "L 5", "L 3", "L 7", "L 2", "L 1", "W 4"
), RS = c(5, 3, 3, 3, 5, 5, 1, 0, 7, 2, 10, 5, 1, 3, 2, 18, 7,
2, 8, 14, 5, 3, 9, 9, 5, 3, 7, 2, 1, 4), RA = c(15, 4, 5, 2,
8, 2, 2, 7, 2, 15, 6, 4, 4, 7, 3, 1, 6, 11, 9, 1, 1, 5, 7, 4,
9, 5, 8, 5, 2, 3), Rdiff = c(-10, -1, -2, 1, -3, 3, -1, -7, 5,
-13, 4, 1, -3, -4, -1, 17, 1, -9, -1, 13, 4, -2, 2, 5, -4, -2,
-1, -3, -1, 1), absV = c(10, 1, 2, 1, 3, 3, 1, 7, 5, 13, 4, 1,
3, 4, 1, 17, 1, 9, 1, 13, 4, 2, 2, 5, 4, 2, 1, 3, 1, 1), App_Dec = c("GS-2, L",
"GS-5", "GS-3, L", "GS-7, W", "GS-6, L", "GS-7, W", "GS-7, L",
"GS-7, L", "GS-6, W", "GS-3, L", "GS-2", "GS-2", "GS-6, L", "GS-5, L",
"GS-6, L", "GS-6, W", "GS-4", "GS-4, L", "GS-2", "GS-7, W", "GS-5, W",
"GS-6", "GS-2", "GS-3", "GS-4", "GS-6, L", "GS-5", "GS-4", "GS-4",
"GS-4"), IP = c(1.1, 5, 3, 7, 6, 6.1, 7, 7, 6, 2.2, 1.2, 2, 6,
5, 5.1, 6, 4, 4, 2, 7, 5, 6, 1.2, 3, 4, 6, 5, 3.2, 4, 4), H = c(5,
5, 8, 4, 6, 4, 4, 3, 3, 8, 8, 4, 6, 3, 7, 3, 10, 7, 3, 3, 4,
3, 4, 4, 9, 6, 4, 4, 4, 4), R = c(6, 3, 5, 1, 4, 2, 2, 1, 1,
6, 6, 3, 4, 2, 3, 1, 5, 5, 5, 1, 1, 2, 4, 2, 5, 4, 2, 1, 1, 3
), ER = c(6, 3, 3, 1, 4, 1, 2, 1, 1, 6, 6, 3, 4, 2, 3, 1, 5,
4, 5, 1, 1, 2, 4, 2, 5, 1, 2, 1, 1, 3), BB = c(2, 2, 1, 1, 0,
1, 2, 2, 4, 3, 0, 1, 2, 2, 1, 2, 0, 0, 4, 2, 2, 2, 4, 1, 0, 2,
2, 2, 0, 3), SO = c(1, 10, 3, 4, 4, 7, 14, 10, 10, 5, 3, 1, 5,
2, 5, 9, 3, 3, 3, 12, 8, 6, 0, 2, 2, 9, 2, 7, 4, 3), HR = c(0,
2, 1, 0, 2, 1, 1, 1, 1, 2, 4, 0, 1, 1, 0, 0, 0, 2, 1, 1, 1, 0,
0, 0, 1, 1, 0, 1, 1, 0), UER = c(0, 0, 2, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0),
Pit = c(38, 113, 67, 107, 66, 95, 112, 100, 102, 76, 52,
40, 94, 81, 91, 102, 66, 71, 70, 108, 81, 100, 52, 69, 84,
103, 86, 60, 57, 70), Str = c(24, 78, 42, 68, 45, 66, 70,
70, 62, 45, 30, 25, 66, 52, 60, 68, 45, 49, 37, 74, 50, 65,
22, 41, 53, 72, 55, 39, 33, 37), GSc = c(19, 53, 29, 68,
48, 65, 73, 75, 68, 20, 18, 36, 47, 53, 46, 69, 25, 33, 29,
77, 61, 62, 27, 44, 26, 57, 51, 54, 54, 42), BF = c(12, 22,
19, 26, 23, 24, 26, 26, 24, 18, 14, 11, 26, 20, 24, 23, 21,
20, 14, 26, 21, 23, 13, 15, 21, 27, 20, 16, 15, 18), AB = c(8,
20, 18, 24, 23, 23, 23, 23, 20, 15, 13, 9, 24, 18, 22, 21,
21, 20, 9, 24, 19, 21, 8, 13, 20, 25, 18, 14, 15, 15), H2B = c(2,
0, 1, 1, 1, 0, 2, 0, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1, 1, 1,
0, 0, 1, 0, 2, 2, 2, 0, 1, 0), H3B = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 1, 0), IBB = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),
HBP = c(1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), SH = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0), SF = c(1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0), GDP = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1), SB = c(0, 1,
1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 2, 0,
1, 0, 0, 0, 3, 0, 0, 0, 0), CS = c(0, 0, 0, 0, 1, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), PO = c(0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), BK = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), WP = c(0, 1, 1, 1, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 0), ERA = c("40.5", "5.4", "9", "1.29", "6", "1.42",
"2.57", "1.29", "1.5", "20.25", "32.4", "13.5", "6", "3.6",
"5.0599999999999996", "1.5", "11.25", "9", "22.5", "1.29",
"1.8", "3", "21.6", "6", "11.25", "1.5", "3.6", "2.4500000000000002",
"2.25", "6.75"), WPA = c(-0.471, -0.087, -0.256, 0.34, -0.22,
0.18, 0.107, 0.219, 0.229, -0.358, -0.487, -0.186, -0.156,
0.036, -0.047, 0.049, -0.329, -0.321, -0.34, 0.193, 0.156,
0.07, -0.312, -0.042, -0.278, -0.271, 0.029, 0.02, 0.092,
-0.174), RE24 = c(-5.122, -0.193, -3.316, 2.931, -1.08, 1.509,
1.406, 2.406, 1.92, -4.641, -5.444, -1.919, -0.758, 0.679,
0.245, 2.215, -3.054, -3.054, -4.027, 2.406, 1.433, 0.92,
-3.788, -0.359, -2.812, -1.08, 0.707, 0.364, 1.166, -0.834
), aLI = c(1.45, 1.244, 0.974, 1.271, 0.965, 0.921, 0.955,
0.888, 1.066, 0.962, 0.767, 1.073, 0.941, 0.852, 1.353, 0.392,
0.857, 0.805, 0.904, 0.75, 1.037, 0.861, 1.232, 1.355, 0.914,
1.239, 1.213, 1.28, 0.748, 1.407)), row.names = c(NA, -30L
), class = c("tbl_df", "tbl", "data.frame"))
Desired output:
The numbers starting in the second column are the total absV values for each player for each column. The last column contains the sum of all the absV values for each player where absV > 5. Only a sample of the first 3 rows are shown, and the absV values are just filler numbers.
| Player | 1 | 2 | 3 | 4 | 5 | >5 |
| deGrom | 2 | 3 | 5 | 0 | 1 | 3 |
| Matz | 2 | 3 | 5 | 0 | 1 | 3 |
Code tried (I need help getting beyond the point shown). I would prefer if the code uses dplyr:
starter %>%
select(Player, absV) %>%
group_by(Player, absV) %>%
summarize(numG= n()) %>%
arrange(Player,absV)
To do this you to bifurcate your data with rows per player >5 and <=5, then rbind them together and thereafter pivot_wider. Follow this code
library(dplyr)
library(tidyr)
df <- starter %>% group_by(Player) %>%
mutate(row = row_number()) %>%
select(Player, absV, row) %>% arrange(Player)
df %>% filter(row <= 5) %>%
mutate(row = as.character(row)) %>%
rbind(df %>% filter(row > 5) %>%
summarise( absV = sum(absV)) %>%
mutate(row = ">5")) %>%
pivot_wider(id_cols = Player, names_from = row, values_from = absV)
# A tibble: 8 x 7
# Groups: Player [8]
Player `1` `2` `3` `4` `5` `>5`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Ariel Jurado 4 NA NA NA NA NA
2 David Peterson 1 5 4 1 1 NA
3 Jacob deGrom 1 1 1 17 13 2
4 Michael Wacha 3 9 5 NA NA NA
5 Rick Porcello 2 7 3 1 2 1
6 Robert Gsellman 2 1 NA NA NA NA
7 Seth Lugo 10 3 4 1 4 3
8 Steven Matz 13 NA NA NA NA NA
Note. Loading tidyverse package, at once, directly is advised.
Note-2 If you still want to sort absV before changing the data-format, add absV in arrange syntax beforehand joining them..
df <- starter %>% group_by(Player) %>%
arrange(Player, absV) %>%
mutate(row = row_number()) %>%
select(Player, absV, row)
df %>% filter(row <= 5) %>%
mutate(row = as.character(row)) %>%
rbind(df %>% filter(row > 5) %>%
summarise( absV = sum(absV)) %>%
mutate(row = ">5")) %>%
pivot_wider(id_cols = Player, names_from = row, values_from = absV)
#this will give the following diff output
# A tibble: 8 x 7
# Groups: Player [8]
Player `1` `2` `3` `4` `5` `>5`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Ariel Jurado 4 NA NA NA NA NA
2 David Peterson 1 1 1 4 5 NA
3 Jacob deGrom 1 1 1 2 13 17
4 Michael Wacha 3 5 9 NA NA NA
5 Rick Porcello 1 1 2 2 3 7
6 Robert Gsellman 1 2 NA NA NA NA
7 Seth Lugo 1 3 3 4 4 10
8 Steven Matz 13 NA NA NA NA NA
Additional Question in comments below
Follow this code to work out frequency of each absV
df %>% group_by(Player, absV) %>% mutate(freq = n()) %>% ungroup()
#check it
df %>% group_by(Player, absV) %>% mutate(freq = n()) %>% ungroup() %>% select(Player, absV, freq)
Player absV freq
<chr> <dbl> <int>
1 Seth Lugo 10 1
2 Jacob deGrom 1 3
3 Rick Porcello 2 2
4 David Peterson 1 3
5 Michael Wacha 3 1
6 Seth Lugo 3 2
7 Jacob deGrom 1 3
8 Rick Porcello 7 1
9 David Peterson 5 1
10 Steven Matz 13 1
# ... with 20 more rows
Using data.table
library(data.table)
dcast(setDT(starter), Player ~ rowid(Player), value.var = 'absV')
I have two data frames:
> df1
2013-04-1 2013-04-2 2013-04-3 2013-04-4 2013-04-5 2013-04-6 2013-04-7 2013-04-8 2013-04-9 2013-04-10 2013-04-11
bin_1 32 489 32 32 364 19 312 0 0 0 346
bin_2 8 346 8 0 98 8 12 12 46 364 346
bin_3 9 98 346 46 9 312 6 1912 0 489 0
bin_4 4 12 9 12 0 12 0 987 9 19 12
bin_5 0 0 8 8 0 0 312 6 312 12 4
df1 contains 5 rows (bins) and 23 columns (date)
> df2
orange apple pear banana watermelon lemon
2013-04-1 1 1 1 1 0 1
2013-04-2 1 1 0 1 0 0
2013-04-3 1 1 1 1 0 1
2013-04-4 0 1 0 1 1 1
2013-04-5 1 0 0 0 1 1
df2 contains 23 rows(date) and 6 columns (types of fruits)
So now, I want to concentrate these 2 dfs into 1 big data frame that contains all the information, like:
> df3
orange apple pear banana watermelon lemon
bin_1 ? ? ? ? ? ?
bin_2 ? ? ? ? ? ?
bin_3 ? ? ? ? ? ?
bin_4 ? ? ? ? ? ?
bin_5 ? ? ? ? ? ?
But how can i concentrate the data? So for example,
on 2013-04-1,
bin_1 contains 32 fruits, bin_2 contains 8 fruits, ..., bin_5 contains 0 fruits (based on df1)
only orange, apple, pear, banana, and lemon are available (based on df2)
Q. I want my df3 to contain concentrate information, like bin_1 on average contain x amount of oranges, ...etc .How can I model this?
Code:
> dput(df1)
structure(list(`2013-04-1` = c(32, 8, 9, 4, 0), `2013-04-2` = c(489,
346, 98, 12, 0), `2013-04-3` = c(32, 8, 346, 9, 8), `2013-04-4` = c(32,
0, 46, 12, 8), `2013-04-5` = c(364, 98, 9, 0, 0), `2013-04-6` = c(19,
8, 312, 12, 0), `2013-04-7` = c(312, 12, 6, 0, 312), `2013-04-8` = c(0,
12, 1912, 987, 6), `2013-04-9` = c(0, 46, 0, 9, 312), `2013-04-10` = c(0,
364, 489, 19, 12), `2013-04-11` = c(346, 346, 0, 12, 4), `2013-04-12` = c(0,
9, 12, 46, 489), `2013-04-13` = c(32, 8, 19, 46, 0), `2013-04-14` = c(0,
987, 12, 0, 6), `2013-04-15` = c(0, 346, 4, 346, 0), `2013-04-16` = c(0,
1912, 1912, 12, 364), `2013-04-17` = c(12, 98, 32, 32, 1912),
`2013-04-18` = c(12, 12, 12, 0, 346), `2013-04-19` = c(9,
46, 98, 312, 4), `2013-04-20` = c(32, 987, 46, 9, 312), `2013-04-21` = c(4,
98, 12, 32, 12), `2013-04-22` = c(19, 0, 4, 346, 0), `2013-04-23` = c(1912,
364, 0, 0, 489)), row.names = c("bin_1", "bin_2", "bin_3",
"bin_4", "bin_5"), class = "data.frame")
> dput(df2)
structure(list(orange = c(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0), apple = c(1, 1, 1, 1, 0, 1,
0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0), pear = c(1,
0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
0), banana = c(1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
0, 0, 1, 1, 0, 1, 0), watermelon = c(0, 0, 0, 1, 1, 0, 1, 1,
1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0), lemon = c(1, 0,
1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0
)), row.names = c("2013-04-1", "2013-04-2", "2013-04-3", "2013-04-4",
"2013-04-5", "2013-04-6", "2013-04-7", "2013-04-8", "2013-04-9",
"2013-04-10", "2013-04-11", "2013-04-12", "2013-04-13", "2013-04-14",
"2013-04-15", "2013-04-16", "2013-04-17", "2013-04-18", "2013-04-19",
"2013-04-20", "2013-04-21", "2013-04-22", "2013-04-23"), class = "data.frame")
I have a dataset as follows:
structure(list(chr = c(1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
1, 0, 0, 1, 1, 1, 1), leftPos = c(240000, 1080000, 1200000, 1320000,
1440000, 1800000, 2400000, 2520000, 3120000, 3360000, 3480000,
3600000, 3720000, 4200000, 4560000, 4920000, 5040000, 5160000,
5280000, 6e+06), chr.1 = c(1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
1, 0, 0, 1, 1, 1, 1, 1), leftPos.1 = c(240000, 1080000, 1200000,
1320000, 1440000, 1800000, 2400000, 2520000, 3120000, 3360000,
3480000, 3600000, 3720000, 4200000, 4560000, 4920000, 5040000,
5160000, 5280000, 6e+06), ASample = c(0,
0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0), Sample1 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1), Sample2 = c(0,
1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1), Sample3 = c(0,
1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1), Sample4 = c(0,
0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample5 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample6 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample7 = c(0,
0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1), Sample8 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample9 = c(0,
0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1), Sample10 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample11 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample12 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample13 = c(0,
0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0), Sample14 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample15 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1)), .Names = c("chr",
"leftPos", "chr.1", "leftPos.1", "Sample1",
"Sample2",
"Sample3", "Sample4",
"Sample5", "Sample6",
"Sample7", "Sample8",
"Sample9", "Sample10",
"Sample11", "Sample12",
"Sample13", "Sample14",
"Sample15"), row.names = c(NA,
20L), class = "data.frame")
I need to count the number of rows per column where there is more than one of the same 1 or -1
I would like to be able to count the number of consecutive rows for each column, grouped by chr, that have three consecutive 1 or -1 within a chromosome (column called chr).
The ideal output would be something like (not taken from the dput data above)
chr numberOfConsecutive1s FreqSample1 FreqSample2 FreqSample3 etc
1 2 3 2 14
1 3 5 2 2
1 4 5 0 6
1 5 4 3 5
1 6 3 0 3
1 7 7 5 7
1 8 5 0 2
1 9 54 2 6
1 10 34 77 7
2 2 6 4 2
2 3 23 34 34
2 4 5 37 2
2 5 55 24 22
2 6 2 0 11
2 7 3 14 5
2 8 2 5 77
2 9 5 23 34
2 10 5 11 34
3 1 32 0 2
So far I have tried the following which simply converts non consecutive 1s into 0s so I am left with the consecutive 1s only. I don't know how to count them up as per the desired output.
dx<-DAT_list2res
f0 <- function( colNr, dx )
{
col <- dx[,colNr]
n1 <- which( col == 1 ) # The `1`-rows.
d0 <- which( diff(col) == 0 ) # Consecutive entries are equal.
dc0 <- which( diff(dx[,1]) == 0 ) # Same chromosome.
m <- intersect( n1-1, intersect( d0, dc0 ) )
return ( setdiff( 1:nrow(dx), union(m,m+1) ) )
}
g <- function( dx )
{
for ( i in 3:ncol(dx) ) { dx[f0(i,dx),i] <- 0 }
return ( dx )
}
dx<-g(dx)
EDIT
I also tried this as suggested by bramtayl:
result =
consecFreq %>%
select(-chr) %>%
gather(variable, chr, 5:190) %>%
group_by(variable) %>%
mutate(ID =
chr %>%
lag %>%
`!=`(chr) %>%
plyr::mapvalues(NA, FALSE) %>%
cumsum) %>%
count(variable, chr, ID) %>%
rename(numberOfConsecutive1s = n) %>%
count(variable, chr, numberOfConsecutive1s) %>%
spread(variable, n, fill = 0)
but it gives me an 'index out of bounds' error. If I ignore the spread line I get an odd output as well so I'm not sure this is the answer
REVISED
Based on clarifications, this approach uses the rle function for each chromosome to find runs of consecutive 1's or -1's and then table to count the number of runs for each value. This gives NA for samples which have no counts for a particular value so the last line of the code converts the NA's to 0's if this is helpful. Finally there seems to be a problem with your structure input in that Cytospongex10_SLX.9395.FastSeqK.fq.gz.res is missing from the .Names section of structure. This causes all the column names to be shifted and the last column name to be NA which can cause problems in the execution.
The code below assigns the correct names to the input data (in data.frame df) and then calculates the frequencies as described above.
colnames(data) <- c("chr",
"leftPos", "chr.1", "leftPos.1", "Cytospongex10_SLX.9395.FastSeqK.fq.gz.res", "Sample1",
"Sample2",
"Sample3", "Sample4",
"Sample5", "Sample6",
"Sample7", "Sample8",
"Sample9", "Sample10",
"Sample11", "Sample12",
"Sample13", "Sample14",
"Sample15")
chr_labels <- sort(unique(data$chr))
sampl_freqs <- data.frame(chr=1, numberOfConsecutive1s=1, count=0)
for( sampl in colnames(data)[-(1:5)]) {
freqs <- data.frame()
for( chr in chr_labels ) {
runs <- rle(data[data$chr == chr,sampl])
freqs_chr <- data.frame(chr=chr, table(runs$length[runs$values %in% c(-1,1)], dnn = "numberOfConsecutive1s") )
freqs <- rbind(freqs, freqs_chr)
}
sampl_freqs <- merge.data.frame(sampl_freqs, freqs, by = c("chr","numberOfConsecutive1s"), all=TRUE)
colnames(sampl_freqs) <- c(head(colnames(sampl_freqs),-1),paste("Freq",sampl,sep=""))
}
# clean up from sampl_freqs definition
sampl_freqs <- sampl_freqs[,-3]
# To convert NA's to 0
sampl_freqs <- data.frame(sampl_freqs[,1:2], sapply(sampl_freqs[,-(1:2)], function(x) ifelse(is.na(x), 0, x)))
Similar to above, but uses dplyr
library(reshape2)
library(dplyr)
df <- melt(data[,-(2:5)], id.vars="chr", variable.name="sample")
sampl_freqs <- df %>% group_by(sample, chr ) %>%
do(data.frame(unclass(rle(.$value))) %>%
filter(values %in% c(-1,1)) ) %>%
group_by(sample, chr, lengths) %>%
summarize(Freq = n() ) %>%
dcast( chr + lengths ~ sample, value.var = "Freq" )
sampl_freqs <- with(sampl_freqs,data.frame( chr, numberOfConsecutive1s = lengths ,
sapply(sampl_freqs[,-(1:2)], function(x) ifelse(is.na(x), 0, x))))
I think you want something like this:
library(dplyr)
library(tidyr)
min_chunk_length = 1
result =
data %>%
rename(chromosome = chr) %>%
select(chromosome, Sample1:Sample15) %>%
gather(sample, value, Sample1:Sample15) %>%
group_by(chromosome, sample) %>%
mutate(non_zero = value %in% c(1, -1),
chunk_ID =
non_zero %>%
lag %>%
`!=`(non_zero) %>%
plyr::mapvalues(NA, FALSE) %>%
cumsum) %>%
filter(non_zero = TRUE) %>%
group_by(chromosome, sample, chunk_ID) %>%
mutate(length_of_chunk = n()) %>%
filter(length_of_chunk > min_chunk_length) %>%
count(chromosome, sample) %>%
spread(sample, n, fill = 0)