Collapsing daily longitudinal data into monthly observations by ID in R - r

I have longitudinal data with >100 rows per subject representing daily observations. I want to collapse columns, by subject ID, into monthly observations (i.e. have multiple rows per ID that are summarizing every 30 rows (days) of data).
How can you specify such groupings of days using dplyr?
Also of note, all subjects have different total number of days
Edit: data sample below
df<-structure(list(ID = structure(c(100087, 100087, 100087, 100087,
100087, 100087, 100087, 100087, 100087, 100087, 100087, 100087,
100087, 100087, 100087, 100087, 100087, 100087, 100087, 100087)), time = structure(c(0, 1, 2, 3,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)),
BMI = structure(c(20.06, 20.06, 20.06, 20.06, 20.06, 20.06,
20.06, 20.06, 20.06, 20.06, 20.06, 20.06, 20.06, 20.06, 20.06,
20.06, 20.06, 20.06, 20.06, 20.06)), Dis = structure(c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)),
Drug1 = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1)), Drug2 = structure(c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1))), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))

I would group_by ID and a new times variable where you can specify the bins with time %/% 30 for 30 rows. Since your example data has only a few rows I set this to 5. Since each respondent has a different amount of times we need to record first_time and last_time to then overwrite times as x - y times where x and y are first and last time.
In the across call you would need to specify the way you want to aggregate the data, below I choose mean. In case you want to get the mean of BMI and the max value of Drug1 you would need to specify each column in a separate function call.
library(dplyr)
df %>%
group_by(ID, times = time %/% 5) %>%
summarise(across(BMI:Drug2, mean),
time_first = first(time),
time_last = last(time)
) %>%
ungroup() %>%
mutate(times = paste0(time_first, "-", time_last)) %>%
select(-c(time_first, time_last))
#> `summarise()` has grouped output by 'ID'. You can override using the `.groups`
#> argument.
#> # A tibble: 4 × 6
#> ID times BMI Dis Drug1 Drug2
#> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 100087 0-4 20.1 0 1 1
#> 2 100087 5-9 20.1 0 1 1
#> 3 100087 10-14 20.1 0 1 1
#> 4 100087 15-19 20.1 0 1 1
# OPs data
df <- structure(list(ID = structure(c(100087, 100087, 100087, 100087, 100087, 100087, 100087, 100087, 100087, 100087, 100087, 100087, 100087, 100087, 100087, 100087, 100087, 100087, 100087, 100087)), time = structure(c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)), BMI = structure(c(20.06, 20.06, 20.06, 20.06, 20.06, 20.06, 20.06, 20.06, 20.06, 20.06, 20.06, 20.06, 20.06, 20.06, 20.06, 20.06, 20.06, 20.06, 20.06, 20.06)), Dis = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), Drug1 = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), Drug2 = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1))), row.names = c(NA, -20L), class = c("tbl_df", "tbl", "data.frame"))
Created on 2022-09-27 by the reprex package (v0.3.0)

Related

Count the frequency of concecutive zeros in a every time they appear in a each row

I have this dataframe and would like to compute a count of zero sequences every time they appear in a row so that the output would be A: 2 4, B:1 2 1, C:2 5, D: 2 3, E: 1 1
df <- data.frame(
A=c(1, 0, 0, 1, 1, 0, 0, 0, 0),
B=c(0, 1, 1, 0, 0, 1, 0, 1, 1),
C=c(0, 0, 1, 1, 0, 0, 0, 0, 0),
D=c(0, 0, 1, 1, 1, 1, 0, 0, 0),
E=c(1, 0, 1, 1, 1, 1, 0, 1, 1)
)
We may use rle by looping over the columns of the data.frame and get the lengths of the 0 values in base R
lapply(df1, function(x) with(rle(x), lengths[!values]))
-output
$A
[1] 2 4
$B
[1] 1 2 1
$C
[1] 2 5
$D
[1] 2 3
$E
[1] 1 1
data
df1 <- structure(list(A = c(1, 0, 0, 1, 1, 0, 0, 0, 0), B = c(0, 1,
1, 0, 0, 1, 0, 1, 1), C = c(0, 0, 1, 1, 0, 0, 0, 0, 0), D = c(0,
0, 1, 1, 1, 1, 0, 0, 0), E = c(1, 0, 1, 1, 1, 1, 0, 1, 1)), row.names = c(NA,
-9L), class = "data.frame")

Create a table from the count and percentage of different columns in a DF

I got this df
structure(list(patients = 1:10, adm1 = c(1, 1, 1, 1, 1, 1, 1,
1, 1, 1), adm2 = c(0, 1, 1, 0, 0, 1, 1, 0, 0, 1), adm3 = c(0,
0, 1, 1, 0, 0, 0, 1, 0, 1), adm4 = c(0, 0, 0, 0, 1, 0, 0, 0,
1, 0)), class = "data.frame", row.names = c(NA, -10L))
I want a table like this
adm1 adm2 adm3 adm4
No of patients 10 5 3 2
Percentage 100% 50% 30% 20%
We can use colSums and colMeans -
rbind(no_of_patients = colSums(df[-1]),
percentage = colMeans(df[-1]) * 100)
#. adm1 adm2 adm3 adm4
#no_of_patients 10 5 4 2
#percentage 100 50 40 20
Try colSums and colMeans
t(
sapply(
c(no_of_patients = colSums, percentage = colMeans),
function(f) f(df[-1])
)
)
which gives
adm1 adm2 adm3 adm4
no_of_patients 10 5.0 4.0 2.0
percentage 1 0.5 0.4 0.2

How can I represent one column's values using multiple columns in R where one new column is conditional?

Looking at similar questions, I could not find one that matched my need.
If one does contain a solution, please share its link.
I have this dput-produced data:
structure(list(Player = c("Seth Lugo", "Jacob deGrom", "Rick Porcello",
"David Peterson", "Michael Wacha", "Seth Lugo", "Jacob deGrom",
"Rick Porcello", "David Peterson", "Steven Matz", "Seth Lugo",
"Jacob deGrom", "Rick Porcello", "David Peterson", "Seth Lugo",
"Jacob deGrom", "Rick Porcello", "Michael Wacha", "David Peterson",
"Jacob deGrom", "Seth Lugo", "Rick Porcello", "Robert Gsellman",
"Michael Wacha", "Ariel Jurado", "Jacob deGrom", "Rick Porcello",
"Seth Lugo", "Robert Gsellman", "David Peterson"), Date = structure(c(1601164800,
1601078400, 1601078400, 1600905600, 1600819200, 1600732800, 1600646400,
1600560000, 1600473600, 1600387200, 1600300800, 1600214400, 1600128000,
1599955200, 1599868800, 1599782400, 1599609600, 1599523200, 1599436800,
1599350400, 1599264000, 1599177600, 1599091200, 1599004800, 1598918400,
1598832000, 1598745600, 1598745600, 1598659200, 1598572800), tzone = "UTC", class = c("POSIXct",
"POSIXt")), DblHdr = c(0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 2), DateStr = c("09/27/2020",
"09/26/2020", "09/26/2020", "09/24/2020", "09/23/2020", "09/22/2020",
"09/21/2020", "09/20/2020", "09/19/2020", "09/18/2020", "09/17/2020",
"09/16/2020", "09/15/2020", "09/13/2020", "09/12/2020", "09/11/2020",
"09/09/2020", "09/08/2020", "09/07/2020", "09/06/2020", "09/05/2020",
"09/04/2020", "09/03/2020", "09/02/2020", "09/01/2020", "08/31/2020",
"08/30/2020", "08/30/2020", "08/29/2020", "08/28/2020"), Month = c("09",
"09", "09", "09", "09", "09", "09", "09", "09", "09", "09", "09",
"09", "09", "09", "09", "09", "09", "09", "09", "09", "09", "09",
"09", "09", "08", "08", "08", "08", "08"), Tm = c("NYM", "NYM",
"NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM",
"NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM",
"NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM",
"NYM"), Opp = c("WSN", "WSN", "WSN", "WSN", "TBR", "TBR", "TBR",
"ATL", "ATL", "ATL", "PHI", "PHI", "PHI", "TOR", "TOR", "TOR",
"BAL", "BAL", "PHI", "PHI", "PHI", "PHI", "NYY", "BAL", "BAL",
"MIA", "NYY", "NYY", "NYY", "NYY"), Rslt = c("L 5-15", "L 3-4",
"L 3-5", "W 3-2", "L 5-8", "W 5-2", "L 1-2", "L 0-7", "W 7-2",
"L 2-15", "W 10-6", "W 5-4", "L 1-4", "L 3-7", "L 2-3", "W 18-1",
"W 7-6", "L 2-11", "L 8-9", "W 14-1", "W 5-1", "L 3-5", "W 9-7",
"W 9-4", "L 5-9", "L 3-5", "L 7-8", "L 2-5", "L 1-2", "W 4-3"
), W_L = c("L", "L", "L", "W", "L", "W", "L", "L", "W", "L",
"W", "W", "L", "L", "L", "W", "W", "L", "L", "W", "W", "L", "W",
"W", "L", "L", "L", "L", "L", "W"), temp = c("L 5", "L 3", "L 3",
"W 3", "L 5", "W 5", "L 1", "L 0", "W 7", "L 2", "W 10", "W 5",
"L 1", "L 3", "L 2", "W 18", "W 7", "L 2", "L 8", "W 14", "W 5",
"L 3", "W 9", "W 9", "L 5", "L 3", "L 7", "L 2", "L 1", "W 4"
), RS = c(5, 3, 3, 3, 5, 5, 1, 0, 7, 2, 10, 5, 1, 3, 2, 18, 7,
2, 8, 14, 5, 3, 9, 9, 5, 3, 7, 2, 1, 4), RA = c(15, 4, 5, 2,
8, 2, 2, 7, 2, 15, 6, 4, 4, 7, 3, 1, 6, 11, 9, 1, 1, 5, 7, 4,
9, 5, 8, 5, 2, 3), Rdiff = c(-10, -1, -2, 1, -3, 3, -1, -7, 5,
-13, 4, 1, -3, -4, -1, 17, 1, -9, -1, 13, 4, -2, 2, 5, -4, -2,
-1, -3, -1, 1), absV = c(10, 1, 2, 1, 3, 3, 1, 7, 5, 13, 4, 1,
3, 4, 1, 17, 1, 9, 1, 13, 4, 2, 2, 5, 4, 2, 1, 3, 1, 1), App_Dec = c("GS-2, L",
"GS-5", "GS-3, L", "GS-7, W", "GS-6, L", "GS-7, W", "GS-7, L",
"GS-7, L", "GS-6, W", "GS-3, L", "GS-2", "GS-2", "GS-6, L", "GS-5, L",
"GS-6, L", "GS-6, W", "GS-4", "GS-4, L", "GS-2", "GS-7, W", "GS-5, W",
"GS-6", "GS-2", "GS-3", "GS-4", "GS-6, L", "GS-5", "GS-4", "GS-4",
"GS-4"), IP = c(1.1, 5, 3, 7, 6, 6.1, 7, 7, 6, 2.2, 1.2, 2, 6,
5, 5.1, 6, 4, 4, 2, 7, 5, 6, 1.2, 3, 4, 6, 5, 3.2, 4, 4), H = c(5,
5, 8, 4, 6, 4, 4, 3, 3, 8, 8, 4, 6, 3, 7, 3, 10, 7, 3, 3, 4,
3, 4, 4, 9, 6, 4, 4, 4, 4), R = c(6, 3, 5, 1, 4, 2, 2, 1, 1,
6, 6, 3, 4, 2, 3, 1, 5, 5, 5, 1, 1, 2, 4, 2, 5, 4, 2, 1, 1, 3
), ER = c(6, 3, 3, 1, 4, 1, 2, 1, 1, 6, 6, 3, 4, 2, 3, 1, 5,
4, 5, 1, 1, 2, 4, 2, 5, 1, 2, 1, 1, 3), BB = c(2, 2, 1, 1, 0,
1, 2, 2, 4, 3, 0, 1, 2, 2, 1, 2, 0, 0, 4, 2, 2, 2, 4, 1, 0, 2,
2, 2, 0, 3), SO = c(1, 10, 3, 4, 4, 7, 14, 10, 10, 5, 3, 1, 5,
2, 5, 9, 3, 3, 3, 12, 8, 6, 0, 2, 2, 9, 2, 7, 4, 3), HR = c(0,
2, 1, 0, 2, 1, 1, 1, 1, 2, 4, 0, 1, 1, 0, 0, 0, 2, 1, 1, 1, 0,
0, 0, 1, 1, 0, 1, 1, 0), UER = c(0, 0, 2, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0),
Pit = c(38, 113, 67, 107, 66, 95, 112, 100, 102, 76, 52,
40, 94, 81, 91, 102, 66, 71, 70, 108, 81, 100, 52, 69, 84,
103, 86, 60, 57, 70), Str = c(24, 78, 42, 68, 45, 66, 70,
70, 62, 45, 30, 25, 66, 52, 60, 68, 45, 49, 37, 74, 50, 65,
22, 41, 53, 72, 55, 39, 33, 37), GSc = c(19, 53, 29, 68,
48, 65, 73, 75, 68, 20, 18, 36, 47, 53, 46, 69, 25, 33, 29,
77, 61, 62, 27, 44, 26, 57, 51, 54, 54, 42), BF = c(12, 22,
19, 26, 23, 24, 26, 26, 24, 18, 14, 11, 26, 20, 24, 23, 21,
20, 14, 26, 21, 23, 13, 15, 21, 27, 20, 16, 15, 18), AB = c(8,
20, 18, 24, 23, 23, 23, 23, 20, 15, 13, 9, 24, 18, 22, 21,
21, 20, 9, 24, 19, 21, 8, 13, 20, 25, 18, 14, 15, 15), H2B = c(2,
0, 1, 1, 1, 0, 2, 0, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1, 1, 1,
0, 0, 1, 0, 2, 2, 2, 0, 1, 0), H3B = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 1, 0), IBB = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),
HBP = c(1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), SH = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0), SF = c(1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0), GDP = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1), SB = c(0, 1,
1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 2, 0,
1, 0, 0, 0, 3, 0, 0, 0, 0), CS = c(0, 0, 0, 0, 1, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), PO = c(0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), BK = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), WP = c(0, 1, 1, 1, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 0), ERA = c("40.5", "5.4", "9", "1.29", "6", "1.42",
"2.57", "1.29", "1.5", "20.25", "32.4", "13.5", "6", "3.6",
"5.0599999999999996", "1.5", "11.25", "9", "22.5", "1.29",
"1.8", "3", "21.6", "6", "11.25", "1.5", "3.6", "2.4500000000000002",
"2.25", "6.75"), WPA = c(-0.471, -0.087, -0.256, 0.34, -0.22,
0.18, 0.107, 0.219, 0.229, -0.358, -0.487, -0.186, -0.156,
0.036, -0.047, 0.049, -0.329, -0.321, -0.34, 0.193, 0.156,
0.07, -0.312, -0.042, -0.278, -0.271, 0.029, 0.02, 0.092,
-0.174), RE24 = c(-5.122, -0.193, -3.316, 2.931, -1.08, 1.509,
1.406, 2.406, 1.92, -4.641, -5.444, -1.919, -0.758, 0.679,
0.245, 2.215, -3.054, -3.054, -4.027, 2.406, 1.433, 0.92,
-3.788, -0.359, -2.812, -1.08, 0.707, 0.364, 1.166, -0.834
), aLI = c(1.45, 1.244, 0.974, 1.271, 0.965, 0.921, 0.955,
0.888, 1.066, 0.962, 0.767, 1.073, 0.941, 0.852, 1.353, 0.392,
0.857, 0.805, 0.904, 0.75, 1.037, 0.861, 1.232, 1.355, 0.914,
1.239, 1.213, 1.28, 0.748, 1.407)), row.names = c(NA, -30L
), class = c("tbl_df", "tbl", "data.frame"))
Desired output:
The numbers starting in the second column are the total absV values for each player for each column. The last column contains the sum of all the absV values for each player where absV > 5. Only a sample of the first 3 rows are shown, and the absV values are just filler numbers.
| Player | 1 | 2 | 3 | 4 | 5 | >5 |
| deGrom | 2 | 3 | 5 | 0 | 1 | 3 |
| Matz | 2 | 3 | 5 | 0 | 1 | 3 |
Code tried (I need help getting beyond the point shown). I would prefer if the code uses dplyr:
starter %>%
select(Player, absV) %>%
group_by(Player, absV) %>%
summarize(numG= n()) %>%
arrange(Player,absV)
To do this you to bifurcate your data with rows per player >5 and <=5, then rbind them together and thereafter pivot_wider. Follow this code
library(dplyr)
library(tidyr)
df <- starter %>% group_by(Player) %>%
mutate(row = row_number()) %>%
select(Player, absV, row) %>% arrange(Player)
df %>% filter(row <= 5) %>%
mutate(row = as.character(row)) %>%
rbind(df %>% filter(row > 5) %>%
summarise( absV = sum(absV)) %>%
mutate(row = ">5")) %>%
pivot_wider(id_cols = Player, names_from = row, values_from = absV)
# A tibble: 8 x 7
# Groups: Player [8]
Player `1` `2` `3` `4` `5` `>5`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Ariel Jurado 4 NA NA NA NA NA
2 David Peterson 1 5 4 1 1 NA
3 Jacob deGrom 1 1 1 17 13 2
4 Michael Wacha 3 9 5 NA NA NA
5 Rick Porcello 2 7 3 1 2 1
6 Robert Gsellman 2 1 NA NA NA NA
7 Seth Lugo 10 3 4 1 4 3
8 Steven Matz 13 NA NA NA NA NA
Note. Loading tidyverse package, at once, directly is advised.
Note-2 If you still want to sort absV before changing the data-format, add absV in arrange syntax beforehand joining them..
df <- starter %>% group_by(Player) %>%
arrange(Player, absV) %>%
mutate(row = row_number()) %>%
select(Player, absV, row)
df %>% filter(row <= 5) %>%
mutate(row = as.character(row)) %>%
rbind(df %>% filter(row > 5) %>%
summarise( absV = sum(absV)) %>%
mutate(row = ">5")) %>%
pivot_wider(id_cols = Player, names_from = row, values_from = absV)
#this will give the following diff output
# A tibble: 8 x 7
# Groups: Player [8]
Player `1` `2` `3` `4` `5` `>5`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Ariel Jurado 4 NA NA NA NA NA
2 David Peterson 1 1 1 4 5 NA
3 Jacob deGrom 1 1 1 2 13 17
4 Michael Wacha 3 5 9 NA NA NA
5 Rick Porcello 1 1 2 2 3 7
6 Robert Gsellman 1 2 NA NA NA NA
7 Seth Lugo 1 3 3 4 4 10
8 Steven Matz 13 NA NA NA NA NA
Additional Question in comments below
Follow this code to work out frequency of each absV
df %>% group_by(Player, absV) %>% mutate(freq = n()) %>% ungroup()
#check it
df %>% group_by(Player, absV) %>% mutate(freq = n()) %>% ungroup() %>% select(Player, absV, freq)
Player absV freq
<chr> <dbl> <int>
1 Seth Lugo 10 1
2 Jacob deGrom 1 3
3 Rick Porcello 2 2
4 David Peterson 1 3
5 Michael Wacha 3 1
6 Seth Lugo 3 2
7 Jacob deGrom 1 3
8 Rick Porcello 7 1
9 David Peterson 5 1
10 Steven Matz 13 1
# ... with 20 more rows
Using data.table
library(data.table)
dcast(setDT(starter), Player ~ rowid(Player), value.var = 'absV')

Concentrate data frame information in r

I have two data frames:
> df1
2013-04-1 2013-04-2 2013-04-3 2013-04-4 2013-04-5 2013-04-6 2013-04-7 2013-04-8 2013-04-9 2013-04-10 2013-04-11
bin_1 32 489 32 32 364 19 312 0 0 0 346
bin_2 8 346 8 0 98 8 12 12 46 364 346
bin_3 9 98 346 46 9 312 6 1912 0 489 0
bin_4 4 12 9 12 0 12 0 987 9 19 12
bin_5 0 0 8 8 0 0 312 6 312 12 4
df1 contains 5 rows (bins) and 23 columns (date)
> df2
orange apple pear banana watermelon lemon
2013-04-1 1 1 1 1 0 1
2013-04-2 1 1 0 1 0 0
2013-04-3 1 1 1 1 0 1
2013-04-4 0 1 0 1 1 1
2013-04-5 1 0 0 0 1 1
df2 contains 23 rows(date) and 6 columns (types of fruits)
So now, I want to concentrate these 2 dfs into 1 big data frame that contains all the information, like:
> df3
orange apple pear banana watermelon lemon
bin_1 ? ? ? ? ? ?
bin_2 ? ? ? ? ? ?
bin_3 ? ? ? ? ? ?
bin_4 ? ? ? ? ? ?
bin_5 ? ? ? ? ? ?
But how can i concentrate the data? So for example,
on 2013-04-1,
bin_1 contains 32 fruits, bin_2 contains 8 fruits, ..., bin_5 contains 0 fruits (based on df1)
only orange, apple, pear, banana, and lemon are available (based on df2)
Q. I want my df3 to contain concentrate information, like bin_1 on average contain x amount of oranges, ...etc .How can I model this?
Code:
> dput(df1)
structure(list(`2013-04-1` = c(32, 8, 9, 4, 0), `2013-04-2` = c(489,
346, 98, 12, 0), `2013-04-3` = c(32, 8, 346, 9, 8), `2013-04-4` = c(32,
0, 46, 12, 8), `2013-04-5` = c(364, 98, 9, 0, 0), `2013-04-6` = c(19,
8, 312, 12, 0), `2013-04-7` = c(312, 12, 6, 0, 312), `2013-04-8` = c(0,
12, 1912, 987, 6), `2013-04-9` = c(0, 46, 0, 9, 312), `2013-04-10` = c(0,
364, 489, 19, 12), `2013-04-11` = c(346, 346, 0, 12, 4), `2013-04-12` = c(0,
9, 12, 46, 489), `2013-04-13` = c(32, 8, 19, 46, 0), `2013-04-14` = c(0,
987, 12, 0, 6), `2013-04-15` = c(0, 346, 4, 346, 0), `2013-04-16` = c(0,
1912, 1912, 12, 364), `2013-04-17` = c(12, 98, 32, 32, 1912),
`2013-04-18` = c(12, 12, 12, 0, 346), `2013-04-19` = c(9,
46, 98, 312, 4), `2013-04-20` = c(32, 987, 46, 9, 312), `2013-04-21` = c(4,
98, 12, 32, 12), `2013-04-22` = c(19, 0, 4, 346, 0), `2013-04-23` = c(1912,
364, 0, 0, 489)), row.names = c("bin_1", "bin_2", "bin_3",
"bin_4", "bin_5"), class = "data.frame")
> dput(df2)
structure(list(orange = c(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0), apple = c(1, 1, 1, 1, 0, 1,
0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0), pear = c(1,
0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
0), banana = c(1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
0, 0, 1, 1, 0, 1, 0), watermelon = c(0, 0, 0, 1, 1, 0, 1, 1,
1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0), lemon = c(1, 0,
1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0
)), row.names = c("2013-04-1", "2013-04-2", "2013-04-3", "2013-04-4",
"2013-04-5", "2013-04-6", "2013-04-7", "2013-04-8", "2013-04-9",
"2013-04-10", "2013-04-11", "2013-04-12", "2013-04-13", "2013-04-14",
"2013-04-15", "2013-04-16", "2013-04-17", "2013-04-18", "2013-04-19",
"2013-04-20", "2013-04-21", "2013-04-22", "2013-04-23"), class = "data.frame")

How to frequency of consecutive rows with the same number for several columns

I have a dataset as follows:
structure(list(chr = c(1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
1, 0, 0, 1, 1, 1, 1), leftPos = c(240000, 1080000, 1200000, 1320000,
1440000, 1800000, 2400000, 2520000, 3120000, 3360000, 3480000,
3600000, 3720000, 4200000, 4560000, 4920000, 5040000, 5160000,
5280000, 6e+06), chr.1 = c(1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
1, 0, 0, 1, 1, 1, 1, 1), leftPos.1 = c(240000, 1080000, 1200000,
1320000, 1440000, 1800000, 2400000, 2520000, 3120000, 3360000,
3480000, 3600000, 3720000, 4200000, 4560000, 4920000, 5040000,
5160000, 5280000, 6e+06), ASample = c(0,
0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0), Sample1 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1), Sample2 = c(0,
1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1), Sample3 = c(0,
1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1), Sample4 = c(0,
0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample5 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample6 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample7 = c(0,
0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1), Sample8 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample9 = c(0,
0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1), Sample10 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample11 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample12 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample13 = c(0,
0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0), Sample14 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample15 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1)), .Names = c("chr",
"leftPos", "chr.1", "leftPos.1", "Sample1",
"Sample2",
"Sample3", "Sample4",
"Sample5", "Sample6",
"Sample7", "Sample8",
"Sample9", "Sample10",
"Sample11", "Sample12",
"Sample13", "Sample14",
"Sample15"), row.names = c(NA,
20L), class = "data.frame")
I need to count the number of rows per column where there is more than one of the same 1 or -1
I would like to be able to count the number of consecutive rows for each column, grouped by chr, that have three consecutive 1 or -1 within a chromosome (column called chr).
The ideal output would be something like (not taken from the dput data above)
chr numberOfConsecutive1s FreqSample1 FreqSample2 FreqSample3 etc
1 2 3 2 14
1 3 5 2 2
1 4 5 0 6
1 5 4 3 5
1 6 3 0 3
1 7 7 5 7
1 8 5 0 2
1 9 54 2 6
1 10 34 77 7
2 2 6 4 2
2 3 23 34 34
2 4 5 37 2
2 5 55 24 22
2 6 2 0 11
2 7 3 14 5
2 8 2 5 77
2 9 5 23 34
2 10 5 11 34
3 1 32 0 2
So far I have tried the following which simply converts non consecutive 1s into 0s so I am left with the consecutive 1s only. I don't know how to count them up as per the desired output.
dx<-DAT_list2res
f0 <- function( colNr, dx )
{
col <- dx[,colNr]
n1 <- which( col == 1 ) # The `1`-rows.
d0 <- which( diff(col) == 0 ) # Consecutive entries are equal.
dc0 <- which( diff(dx[,1]) == 0 ) # Same chromosome.
m <- intersect( n1-1, intersect( d0, dc0 ) )
return ( setdiff( 1:nrow(dx), union(m,m+1) ) )
}
g <- function( dx )
{
for ( i in 3:ncol(dx) ) { dx[f0(i,dx),i] <- 0 }
return ( dx )
}
dx<-g(dx)
EDIT
I also tried this as suggested by bramtayl:
result =
consecFreq %>%
select(-chr) %>%
gather(variable, chr, 5:190) %>%
group_by(variable) %>%
mutate(ID =
chr %>%
lag %>%
`!=`(chr) %>%
plyr::mapvalues(NA, FALSE) %>%
cumsum) %>%
count(variable, chr, ID) %>%
rename(numberOfConsecutive1s = n) %>%
count(variable, chr, numberOfConsecutive1s) %>%
spread(variable, n, fill = 0)
but it gives me an 'index out of bounds' error. If I ignore the spread line I get an odd output as well so I'm not sure this is the answer
REVISED
Based on clarifications, this approach uses the rle function for each chromosome to find runs of consecutive 1's or -1's and then table to count the number of runs for each value. This gives NA for samples which have no counts for a particular value so the last line of the code converts the NA's to 0's if this is helpful. Finally there seems to be a problem with your structure input in that Cytospongex10_SLX.9395.FastSeqK.fq.gz.res is missing from the .Names section of structure. This causes all the column names to be shifted and the last column name to be NA which can cause problems in the execution.
The code below assigns the correct names to the input data (in data.frame df) and then calculates the frequencies as described above.
colnames(data) <- c("chr",
"leftPos", "chr.1", "leftPos.1", "Cytospongex10_SLX.9395.FastSeqK.fq.gz.res", "Sample1",
"Sample2",
"Sample3", "Sample4",
"Sample5", "Sample6",
"Sample7", "Sample8",
"Sample9", "Sample10",
"Sample11", "Sample12",
"Sample13", "Sample14",
"Sample15")
chr_labels <- sort(unique(data$chr))
sampl_freqs <- data.frame(chr=1, numberOfConsecutive1s=1, count=0)
for( sampl in colnames(data)[-(1:5)]) {
freqs <- data.frame()
for( chr in chr_labels ) {
runs <- rle(data[data$chr == chr,sampl])
freqs_chr <- data.frame(chr=chr, table(runs$length[runs$values %in% c(-1,1)], dnn = "numberOfConsecutive1s") )
freqs <- rbind(freqs, freqs_chr)
}
sampl_freqs <- merge.data.frame(sampl_freqs, freqs, by = c("chr","numberOfConsecutive1s"), all=TRUE)
colnames(sampl_freqs) <- c(head(colnames(sampl_freqs),-1),paste("Freq",sampl,sep=""))
}
# clean up from sampl_freqs definition
sampl_freqs <- sampl_freqs[,-3]
# To convert NA's to 0
sampl_freqs <- data.frame(sampl_freqs[,1:2], sapply(sampl_freqs[,-(1:2)], function(x) ifelse(is.na(x), 0, x)))
Similar to above, but uses dplyr
library(reshape2)
library(dplyr)
df <- melt(data[,-(2:5)], id.vars="chr", variable.name="sample")
sampl_freqs <- df %>% group_by(sample, chr ) %>%
do(data.frame(unclass(rle(.$value))) %>%
filter(values %in% c(-1,1)) ) %>%
group_by(sample, chr, lengths) %>%
summarize(Freq = n() ) %>%
dcast( chr + lengths ~ sample, value.var = "Freq" )
sampl_freqs <- with(sampl_freqs,data.frame( chr, numberOfConsecutive1s = lengths ,
sapply(sampl_freqs[,-(1:2)], function(x) ifelse(is.na(x), 0, x))))
I think you want something like this:
library(dplyr)
library(tidyr)
min_chunk_length = 1
result =
data %>%
rename(chromosome = chr) %>%
select(chromosome, Sample1:Sample15) %>%
gather(sample, value, Sample1:Sample15) %>%
group_by(chromosome, sample) %>%
mutate(non_zero = value %in% c(1, -1),
chunk_ID =
non_zero %>%
lag %>%
`!=`(non_zero) %>%
plyr::mapvalues(NA, FALSE) %>%
cumsum) %>%
filter(non_zero = TRUE) %>%
group_by(chromosome, sample, chunk_ID) %>%
mutate(length_of_chunk = n()) %>%
filter(length_of_chunk > min_chunk_length) %>%
count(chromosome, sample) %>%
spread(sample, n, fill = 0)

Resources