Subsetting rows based on multiple columns using data.table - fastest way - r

I was wondering if there was a more elegant, less clunky and faster way to do this. I have millions of rows with ICD coding for clinical data. A short example provided below. I was to subset the dataset based on either of the columns meeting a specific set of diagnosis codes. The code below works but takes ages in R and was wondering if there is a faster way.
structure(list(eid = 1:10, mc1 = structure(c(4L, 3L, 5L, 2L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("345", "410", "413.9", "I20.1",
"I23.4"), class = "factor"), oc1 = c(350, 323, 12, 35, 413.1,
345, 345, 345, 345, 345), oc2 = structure(c(5L, 6L, 4L, 1L, 1L,
2L, 2L, 2L, 3L, 2L), .Label = c("", "345", "I20.3", "J23.6",
"K50.1", "K51.4"), class = "factor")), .Names = c("eid", "mc1",
"oc1", "oc2"), class = c("data.table", "data.frame"), row.names = c(NA,
-10L), .internal.selfref = <pointer: 0x102812578>)
The code below subsets all rows that meet the code of either "I20" or "413" (this would include all codes that have for example been coded as "I20.4" or "413.9" etc.
dat2 <- dat [substr(dat$mc1,1,3)== "413"|
substr(dat$oc1,1,3)== "413"|
substr(dat$oc2,1,3)== "413"|
substr(dat$mc1,1,3)== "I20"|
substr(dat$oc1,1,3)== "I20"|
substr(dat$oc2,1,3)== "I20"]
Is there a faster way to do this? For example can i loop through each of the columns looking for the specific codes "I20" or "413" and subset those rows?

We can specify the columns of interest in .SDcols, loop through the Subset of Data.table (.SD), get the first 3 characters with substr, check whether it is %in% a vector of values and Reduce it to a single logical vector for subsetting the rows
dat[dat[,Reduce(`|`, lapply(.SD, function(x)
substr(x, 1, 3) %chin% c('413', 'I20'))), .SDcols = 2:4]]
# eid mc1 oc1 oc2
#1: 1 I20.1 350.0 K50.1
#2: 2 413.9 323.0 K51.4
#3: 5 345 413.1
#4: 9 345 345.0 I20.3

For larger data it could help if we dont chech all rows:
minem <- function(dt, colsID = 2:4) {
cols <- colnames(dt)[colsID]
x <- c('413', 'I20')
set(dt, j = "inn", value = F)
for (i in cols) {
dt[inn == F, inn := substr(get(i), 1, 3) %chin% x]
}
dt[inn == T][, inn := NULL][]
}
n <- 1e7
set.seed(13)
dt <- dts[sample(.N, n, replace = T)]
dt <- cbind(dt, dts[sample(.N, n, replace = T), 2:4])
setnames(dt, make.names(colnames(dt), unique = T))
dt
# eid mc1 oc1 oc2 mc1.1 oc1.1 oc2.1
# 1: 8 345 345.0 345 345 345 345
# 2: 3 I23.4 12.0 J23.6 413.9 323 K51.4
# 3: 4 410 35.0 413.9 323 K51.4
# 4: 1 I20.1 350.0 K50.1 I23.4 12 J23.6
# 5: 10 345 345.0 345 345 345 345
# ---
# 9999996: 3 I23.4 12.0 J23.6 I20.1 350 K50.1
# 9999997: 5 345 413.1 I20.1 350 K50.1
# 9999998: 4 410 35.0 345 345 345
# 9999999: 4 410 35.0 410 35
# 10000000: 10 345 345.0 345 345 345 I20.3
system.time(r1 <- akrun(dt, 2:ncol(dt))) # 22.88 sek
system.time(r2 <- minem(dt, 2:ncol(dt))) # 17.72 sek
all.equal(r1, r2)
# [1] TRUE

Related

How to join two dataframes containing time varying variables in R

This seems like a simple enough thing but I can't figure it out nor find an answer online - apologies if it something obvious. I have two seperate dataframes containing the same patients with the same unique identifier. Both datasets have time varying variables - one continuous and one categorical and the time to each reading is different in the sets but have a common start point at time 1. I have tried to modify the tmerge function from survival package but without luck as I don't have a dichotomous outcome variable nor a single data set with one row per patient.
Reprex for creating the datasets below (df1 and df2) and an example of my desired combined output table for a single patient (ID 3), output gets very long if done for all 4 patients
Thanks for any possible help
df1 <- structure(list(tstart = c(1, 1, 1, 1426, 1, 560, 567), tstop = c(2049,
3426, 1426, 1707, 560, 567, 4207), category = structure(c(1L,
1L, 1L, 2L, 1L, 4L, 2L), .Label = c("none", "high", "low", "moderate"
), class = "factor"), id = c(1L, 2L, 3L, 3L, 4L, 4L, 4L)), row.names = c(NA,
-7L), class = c("tbl_df", "tbl", "data.frame"))
df2 <- structure(list(tstart = c(1, 365, 730, 1, 365, 730, 1096, 2557,
1, 365, 730, 1096, 1826, 2557, 3652, 1), tstop = c(365, 730,
1096, 365, 730, 1096, 2557, 2582, 365, 730, 1096, 1826, 2557,
3652, 4864, 365), egfr = c(66, 62, 58, 54, 50, 43, 49, 51, 106,
103, 80, 92, 97, 90, 81, 51), id = c(1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 4L)), row.names = c(NA, -16L), class = c("tbl_df",
"tbl", "data.frame"))
df_example_patient_3 <- structure(list(id = c(3L, 3L, 3L,
3L, 3L, 3L,3L, 3L, 3L), tstart = c(1, 365, 730, 1096, 1426, 1707, 1826, 2557, 3652), tstop = c(365, 730,
1096, 1426, 1707, 1826, 2557, 3652, 4864), egfr = c(106, 103, 80, 92, 92, 92, 97, 90, 81), category = c("none", "none", "none", "none", "high", "high", "high", "high", "high")), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"))
# DF1
tstart tstop category id
<dbl> <dbl> <fct> <int>
1 1 2049 none 1
2 1 3426 none 2
3 1 1426 none 3
4 1426 1707 high 3
5 1 560 none 4
6 560 567 moderate 4
7 567 4207 high 4
# DF2
tstart tstop egfr id
<dbl> <dbl> <dbl> <int>
1 1 365 66 1
2 365 730 62 1
3 730 1096 58 1
4 1 365 54 2
5 365 730 50 2
6 730 1096 43 2
7 1096 2557 49 2
8 2557 2582 51 2
9 1 365 106 3
10 365 730 103 3
11 730 1096 80 3
12 1096 1826 92 3
13 1826 2557 97 3
14 2557 3652 90 3
15 3652 4864 81 3
16 1 365 51 4
# Combined set
id tstart tstop egfr category
<int> <dbl> <dbl> <dbl> <chr>
1 3 1 365 106 none
2 3 365 730 103 none
3 3 730 1096 80 none
4 3 1096 1426 92 none
5 3 1426 1707 92 high
6 3 1707 1826 92 high
7 3 1826 2557 97 high
8 3 2557 3652 90 high
9 3 3652 4864 81 high
I had to do it this way to really work out the details.
First, i construct a full df1 with all the timestamps, including those of df2.
then i proceed with multiple merges. This is not elegant, but it works:
library(data.table)
library(zoo)
# Proper data.tables
setDT(df1, key = c("id", "tstart"))
setDT(df2, key = c("id", "tstart"))
timestamps_by_id <- unique(rbind(
df1[, .(id, tstart)],
df1[, .(id, tstop)],
df2[, .(id, tstart)],
df2[, .(id, tstop)],
use.names = F
))
setorder(timestamps_by_id, id, tstart)
# Merge to construct full df1
df1_full <- df1[timestamps_by_id]
df1_full[, category := na.locf(category), by = id]
df1_full[, tstop := shift(tstart, -1), by = id]
setkey(df1_full, id, tstart)
# Merge with df2
result <- na.omit(df2[df1_full, roll = T])
result[, tstop := i.tstop]
print(result[id == 3, .(id, tstart, tstop, egfr, category)])
Or a more data.tabley solution using the more arcane foverlaps:
library(data.table)
# Proper data.tables
setDT(df1, key = c("id", "tstart", "tstop"))
setDT(df2, key = c("id", "tstart", "tstop"))
# We add an infinite upper range
proper_df1 <- rbind(
df1,
df1[, .SD[which.max(tstop)], by = .(id)][, .(id, tstart = tstop, tstop = Inf, category), ]
)
setkey(proper_df1, id, tstart, tstop)
overlaps <- foverlaps(df2, proper_df1, type = "any") # Overlap join
overlaps[
tstart %between% .(i.tstart, i.tstop) & tstart != 1,
i.tstart := tstart
]
overlaps[tstop %between% .(i.tstart, i.tstop), i.tstop := tstop]
print(overlaps[
id == 3,
.(id, "tstart" = i.tstart, "tstop" = i.tstop, category, egfr)
])
This messy dplyr solution seems to work for this particular dataset but don't know would it work for all datasets, the direction of the fill may need to be altered depending on particular dataset
library(tidyverse)
library(magrittr)
df1 %>%
bind_rows(df2) %>%
group_by(id) %>%
arrange(id, tstop) %>%
mutate(
tstart = case_when(
tstart < lag(tstop) ~ lag(tstop), TRUE ~ tstart)) %>%
fill(egfr, category, .direction = "updown") %>%
ungroup() %>%
filter(id == 3)
tstart tstop category id egfr
<dbl> <dbl> <fct> <int> <dbl>
1 1 365 none 3 106
2 365 730 none 3 103
3 730 1096 none 3 80
4 1096 1426 none 3 92
5 1426 1707 high 3 92
6 1707 1826 high 3 92
7 1826 2557 high 3 97
8 2557 3652 high 3 90
9 3652 4864 high 3 81

Complex dataframe values selection based on both rows and columns

I need to select some values on each row of the dataset below and compute a sum.
This is a part of my dataset.
> prova
key_duration1 key_duration2 key_duration3 KeyPress1RESP KeyPress2RESP KeyPress3RESP
18 3483 364 3509 b n m
19 2367 818 3924 b n m
20 3775 1591 802 b m n
21 929 3059 744 n b n
22 3732 530 1769 b n m
23 3503 2011 2932 b n b
24 3684 1424 1688 b n m
Rows are trials of the experiment and columns are the keys pressed, in temporal sequence (keypressRESP) and the amount of time of the key until the next one (key_duration).
So for example in the first trial (first row) I pressed "b" and after 3483 ms I pressed "n" and so on.
This is my dataframe
structure(list(key_duration1 = c(3483L, 2367L, 3775L, 929L, 3732L,
3503L, 3684L), key_duration2 = c(364L, 818L, 1591L, 3059L, 530L,
2011L, 1424L), key_duration3 = c(3509, 3924, 802, 744, 1769,
2932, 1688), KeyPress1RESP = structure(c(2L, 2L, 2L, 4L, 2L,
2L, 2L), .Label = c("", "b", "m", "n"), class = "factor"), KeyPress2RESP = structure(c(4L,
4L, 3L, 2L, 4L, 4L, 4L), .Label = c("", "b", "m", "n"), class = "factor"),
KeyPress3RESP = structure(c(3L, 3L, 4L, 4L, 3L, 2L, 3L), .Label = c("",
"b", "m", "n"), class = "factor")), row.names = 18:24, class = "data.frame")
I need a method for select in each row (trial) all "b" values, compute the sum(key_duration) and print the values on a new column, the same for "m".
How can i do?
I think that i need a function similar to 'apply()' but without compute every values on the row but only selected values.
apply(prova[,1:3],1,sum)
Thanks
Here is a way using data.table.
library(data.table)
setDT(prova)
# melt
prova_long <-
melt(
prova[, idx := 1:.N],
id.vars = "idx",
measure.vars = patterns("^key_duration", "^KeyPress"),
variable.name = "key",
value.name = c("duration", "RESP")
)
# aggregate
prova_aggr <- prova_long[RESP != "n", .(duration_sum = sum(duration)), by = .(idx, RESP)]
# spread and join
prova[dcast(prova_aggr, idx ~ paste0("sum_", RESP)), c("sum_b", "sum_m") := .(sum_b, sum_m), on = "idx"]
prova
Result
# key_duration1 key_duration2 key_duration3 KeyPress1RESP KeyPress2RESP KeyPress3RESP idx sum_b sum_m
#1: 3483 364 3509 b n m 1 3483 3509
#2: 2367 818 3924 b n m 2 2367 3924
#3: 3775 1591 802 b m n 3 3775 1591
#4: 929 3059 744 n b n 4 3059 NA
#5: 3732 530 1769 b n m 5 3732 1769
#6: 3503 2011 2932 b n b 6 6435 NA
#7: 3684 1424 1688 b n m 7 3684 1688
The idea is to reshape your data to long format, aggregate by "RESP" per row. Spread the result and join back to your initial data.
With tidyverse you can do:
bind_cols(df %>%
select_at(vars(starts_with("KeyPress"))) %>%
rowid_to_column() %>%
gather(var, val, -rowid), df %>%
select_at(vars(starts_with("key_"))) %>%
rowid_to_column() %>%
gather(var, val, -rowid)) %>%
group_by(rowid) %>%
summarise(b_values = sum(val1[val == "b"]),
m_values = sum(val1[val == "m"])) %>%
left_join(df %>%
rowid_to_column(), by = c("rowid" = "rowid")) %>%
ungroup() %>%
select(-rowid)
b_values m_values key_duration1 key_duration2 key_duration3 KeyPress1RESP KeyPress2RESP KeyPress3RESP
<dbl> <dbl> <int> <int> <dbl> <fct> <fct> <fct>
1 3483. 3509. 3483 364 3509. b n m
2 2367. 3924. 2367 818 3924. b n m
3 3775. 1591. 3775 1591 802. b m n
4 3059. 0. 929 3059 744. n b n
5 3732. 1769. 3732 530 1769. b n m
6 6435. 0. 3503 2011 2932. b n b
7 3684. 1688. 3684 1424 1688. b n m
First, it splits the df into two: one with variables starting with "KeyPress" and one with variables starting with "key_". Second, it transforms the two dfs from wide to long format and combines them by columns. Third, it creates a summary for "b" and "m" values according row ID. Finally, it merges the results with the original df.
You can make a logical matrix from the KeyPress columns, multiply it by the key_duration subset and then take their rowSums.
prova$b_values <- rowSums((prova[, 4:6] == "b") * prova[, 1:3])
prova$n_values <- rowSums((prova[, 4:6] == "n") * prova[, 1:3])
key_duration1 key_duration2 key_duration3 KeyPress1RESP KeyPress2RESP KeyPress3RESP b_values n_values
18 3483 364 3509 b n m 3483 364
19 2367 818 3924 b n m 2367 818
20 3775 1591 802 b m n 3775 802
21 929 3059 744 n b n 3059 1673
22 3732 530 1769 b n m 3732 530
23 3503 2011 2932 b n b 6435 2011
24 3684 1424 1688 b n m 3684 1424
It works because the logical values are coerced to numeric 1s or 0s, and only the values for individual keys are retained.
Extra: to generalise, you could instead use a function and tidyverse/purrr to map it:
get_sums <- function(key) rowSums((prova[, 4:6] == key) * prova[, 1:3])
keylist <- list(b_values = "b", n_values = "n", m_values = "m")
library(tidyverse)
bind_cols(prova, map_dfr(keylist, get_sums))

Using map from purrr to extract a value conditional on another variable

I am trying to map values from multiple dataframes to a primary dataframe.
The example below works partially and i am having trouble in the last part
library(tidyverse)
library(purrr)
library(data.table)
# main data
eid <- c(111,333,555,777,999)
value <-c(121,135,565,400,450)
dat <- as.data.frame(cbind(eid,value),stringsAsFactors=F)
# data from mi to be mapped to main data
eid <- c(111,222,444)
date <- c(134,234,213)
mi <- as.data.frame(cbind(eid,mi.value),stringsAsFactors=F)
# data from cva to be mapped to main data
eid <- c(333,444,555,666)
date <- c(124,132,125,457)
cva <-as.data.frame(cbind(eid,cva.value),stringsAsFactors=F)
# using map to see if eid in 'mi' and 'cva' appear in main data
each.subsequent <- map(list(mi,cva),~
as.integer(dat$eid %in% .x$eid))
names(each.subsequent) <- c("mi","cva")
each.subsequent <- as.data.frame(each.subsequent)
This next bit does not work
# maps the numerical value next to the eid
each.subsequent.value <- map(list(mi,cva),~
ifelse (dat$eid == .x$eid, .x$date,NA))
I have found a work around using right joins but this requires lot of code writing. So i had two questions:
1) Is there a 'map' way of extarcting the $date values from each of the mi and cva dataframes matching the eid?
2) What is the purpose of the '~' and .x in the code above that works?
The desired output should be
structure(list(eid = c(111, 333, 555, 777, 999), value = c(121,
135, 565, 400, 450), mi = c(1L, 0L, 0L, 0L, 0L), cva = c(0L,
1L, 1L, 0L, 0L), mi.date = c(134, NA, NA, NA, NA), cva.date = c(NA,
124, 125, NA, NA)), .Names = c("eid", "value", "mi", "cva", "mi.date",
"cva.date"), row.names = c(NA, -5L), class = "data.frame")
You can do it easily with two left_join, unsless I'm missing something (a lot more data.frames maybe):
dat %>%
left_join(mi, by ="eid") %>%
left_join(cva, by ="eid")
# eid value mi.value cva.value
# 1 111 121 134 NA
# 2 333 135 NA 124
# 3 555 565 NA 125
# 4 777 400 NA NA
# 5 999 450 NA NA
Edit:
if you have more data.frames use reduce:
list(dat, mi, cva) %>% reduce(left_join, by = "eid")

Combining (pasting) columns

I have the following data.frame
Tipo Start End Strand Accesion1 Accesion2
1 gene 197 1558 + <NA> SP_0001
2 CDS 197 1558 + NP_344554 <NA>
3 gene 1717 2853 + <NA> SP_0002
4 CDS 1717 2853 + NP_344555 <NA>
5 gene 2864 3112 + <NA> SP_0003
6 CDS 2864 3112 + NP_344556 <NA>
There are more "Tipo" values, such as tRNA, region , exon, or rRNA, but I am only interested in combining these two, gene and CDS
And I would like to get the following
Start End Accesion1 Accesion2
1 197 1558 NP_344554 SP_0001
but only when the start and End values of gene and CDS coincide. I've tried to use select, arrange and mutate with dplyr, but it is sort of complicated for me to get rid of the NAs
A dplyr version with summarize_each:
DF %>%
group_by(Start, End) %>%
summarise_each(funs(max), Accesion1, Accesion2)
Produces:
Source: local data frame [3 x 4]
Groups: Start
Start End Accesion1 Accesion2
1 197 1558 NP_344554 SP_0001
2 1717 2853 NP_344555 SP_0002
3 2864 3112 NP_344556 SP_0003
Assumes AccessionX varibles are character (does not work with factor), as well as the condition that Start End pairs contain only two values, one each of Tipo and Gene, as in your data set.
You could try
library(data.table)
setDT(df1)[, id:=cumsum(Tipo == 'gene')][,
list(Accesion1=na.omit(Accesion1), Accesion2=na.omit(Accesion2)) ,
list(id, Start, End)]
Here's a solution using aggregate():
df <- data.frame(Tipo=c('gene','CDS','gene','CDS','gene','CDS'), Start=c(197,197,1717,1717,2864,2864), End=c(1558,1558,2853,2853,3112,3112), Strand=c('+','+','+','+','+','+'), Accesion1=c(NA,'NP_344554',NA,'NP_344555',NA,'NP_344556'), Accesion2=c('SP_0001',NA,'SP_0002',NA,'SP_0003',NA) );
df2 <- df[df$Tipo%in%c('gene','CDS'),c('Start','End','Accesion1','Accesion2')];
aggregate(df2[,c('Accesion1','Accesion2')], df2[,c('Start','End')], function(x) x[!is.na(x)] );
## Start End Accesion1 Accesion2
## 1 197 1558 NP_344554 SP_0001
## 2 1717 2853 NP_344555 SP_0002
## 3 2864 3112 NP_344556 SP_0003
Precomputing df2 is necessary in case there are non-gene non-CDS rows in the original data.frame; in order to properly aggregate just the gene and CDS rows, the non-gene non-CDS rows must be excluded from both x and by. (Of course, your example data has only gene and CDS rows, so it's not technically necessary for the example data.)
This solution makes the assumption that whenever two rows have the same Start and End values, then they must be gene/CDS pairs (as opposed to gene/gene or CDS/CDS).
Here is one potential way. You choose rows with gene and CDS. Then, you group your data by Start and END. There may be groups of START/END with 1 or 3+ rows. So you want to make sure that you choose START/END groups with two rows. In addition, you want to make sure that you have both gene and CDS (length(unique(Tipo)) == 2). Finally, you take non-NA element in Accesion1 and Accesion 2.
filter(df, Tipo %in% c("gene", "CDS")) %>%
group_by(Start, End) %>%
filter(n() == 2 & length(unique(Tipo)) == 2) %>%
summarise(Accesion1 = Accesion1[!is.na(Accesion1)],
Accesion2 = Accesion2[!is.na(Accesion2)])
Here is a pseudo example.
mydf <- structure(list(Tipo = structure(c(2L, 1L, 2L, 1L, 2L, 2L), .Label = c("CDS",
"gene"), class = "factor"), Start = c(197, 197, 1717, 1717, 2864,
2864), End = c(1558, 1558, 2853, 2853, 3112, 3112), Strand = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "+", class = "factor"), Accesion1 = structure(c(NA,
1L, NA, 2L, NA, 3L), .Label = c("NP_344554", "NP_344555", "NP_344556"
), class = "factor"), Accesion2 = structure(c(1L, NA, 2L, NA,
3L, NA), .Label = c("SP_0001", "SP_0002", "SP_0003"), class = "factor")), .Names = c("Tipo",
"Start", "End", "Strand", "Accesion1", "Accesion2"), row.names = c(NA,
-6L), class = "data.frame")
Tipo Start End Strand Accesion1 Accesion2
1 gene 197 1558 + <NA> SP_0001
2 CDS 197 1558 + NP_344554 <NA>
3 gene 1717 2853 + <NA> SP_0002
4 CDS 1717 2853 + NP_344555 <NA>
5 gene 2864 3112 + <NA> SP_0003
6 gene 2864 3112 + NP_344556 <NA>
filter(mydf, Tipo %in% c("gene", "CDS")) %>%
group_by(Start, End) %>%
filter(n() == 2 & length(unique(Tipo)) == 2) %>%
summarise(Accesion1 = Accesion1[!is.na(Accesion1)],
Accesion2 = Accesion2[!is.na(Accesion2)])
# Start End Accesion1 Accesion2
#1 197 1558 NP_344554 SP_0001
#2 1717 2853 NP_344555 SP_0002

grouping, comparing, and counting rows in r

I have a data frame that looks as the following:
system Id initial final
665 9 16001 6070 6071
683 10 16001 6100 6101
696 11 16001 6101 6113
712 10 16971 6150 6151
715 11 16971 6151 6163
4966 7 4118 10238 10242
5031 9 4118 10260 10278
5088 10 4118 10279 10304
5115 11 4118 10305 10317
structure(list(system = c(9L, 10L, 11L, 10L, 11L, 7L, 9L, 10L,
11L), Id = c(16001L, 16001L, 16001L, 16971L, 16971L, 4118L, 4118L,
4118L, 4118L), initial = c(6070, 6100, 6101, 6150, 6151, 10238,
10260, 10279, 10305), final = c(6071, 6101, 6113, 6151, 6163,
10242, 10278, 10304, 10317)), .Names = c("system", "Id", "initial",
"final"), row.names = c(665L, 683L, 696L, 712L, 715L, 4966L,
5031L, 5088L, 5115L), class = "data.frame")
I would like to get a new data frame with the next structure
Id system length initial final
1 16001 9,10,11 3 6070 6113
2 16971 10,11 2 6150 6163
3 4118 7 1 10238 10242
4 4118 9,10,11 3 10260 10317
structure(list(Id = c(16001L, 16971L, 4118L, 4118L), system = structure(c(3L,
1L, 2L, 3L), .Label = c("10,11", "7", "9,10,11"), class = "factor"),
length = c(3L, 2L, 1L, 3L), initial = c(6070L, 6150L, 10238L,
10260L), final = c(6113, 6163, 10242, 10317)), .Names = c("Id",
"system", "length", "initial", "final"), class = "data.frame", row.names = c(NA,
-4L))
The grouping is by Id and the difference (between rows) in "system" field equal to one. Also I would like to get the different "system" and how many of that involved in grouping. Finally a column with the first "initial" and the last "final" involved also.
It is possible to do that in r?
Thanks.
You could use data.table. Convert "data.frame" to "data.table" (setDT), create a grouping variable "indx" by taking the difference of adjacent elements of "system" (diff(system)), cumsum the logical vector, use "Id" and "indx" as grouping variable to get the statistics.
library(data.table)
setDT(df)[,list(system=toString(system), length=.N, initial=initial[1L],
final=final[.N]), by=list(Id,indx=cumsum(c(TRUE, diff(system)!=1)))][,
indx:=NULL][]
# Id system length initial final
#1: 16001 9, 10, 11 3 6070 6113
#2: 16971 10, 11 2 6150 6163
#3: 4118 7 1 10238 10242
#4: 4118 9, 10, 11 3 10260 10317
Or based on #jazzurro's comment about using first/last functions from dplyr,
library(dplyr)
df %>%
group_by(indx=cumsum(c(TRUE, diff(system)!=1)), Id) %>%
summarise(system=toString(system), length=n(),
initial=first(initial), final=last(final))
A solution without data.table, but plyr:
library(plyr)
func = function(subdf)
{
bool = c(diff(subdf$system),1)==1
ldply(split(subdf, bool), function(u){
data.frame(system = paste(u$system, collapse=','),
Id = unique(u$Id),
length = nrow(u),
initial= head(u,1)$initial,
final = tail(u,1)$final)
})
}
ldply(split(df, df$Id), func)
# .id system length Id initial final
#1 FALSE 7 1 4118 10238 10242
#2 TRUE 9,10,11 3 4118 10260 10317
#3 TRUE 9,10,11 3 16001 6070 6113
#4 TRUE 10,11 2 16971 6150 6163

Resources