Extraction of characters and symbols using R - r

I have a column with values like these:
id count total SEXO EDAD IDENTIF_AFILIADO
1: 952815090_12_06_Q643 4 133.34 M 39 952815090
2: 952443257_10_17_C64 9 64.32 F 5 952443257
3: 931131767_9_10_C716 2 21.88 M 1 931131767
4: 931131767_8_13_C716 15 173.70 M 1 931131767
5: 931131767_1_09_C716 1 10.94 M 0 931131767
.....
The id column has a code after the third " _ ". For instance, the first row has "952815090_12_06_Q643"
I need to extract the code Q643.
More specifically the group of characters after the third "_" in every row. How to perform it using R?

Using regular expressions:
# The pattern is anchored at both ends, so it can match at most once per
# string; a single substitution returns everything after the third "_".
sub("^.*_.*_.*_(.*)$", "\\1", id)

This should do it:
# Extract the 4th "_"-separated piece of every id.
# - as.character(): strsplit() rejects factors, which is what read.table()
#   produced for text columns before R 4.0.
# - fixed = TRUE: "_" is a literal separator, no regex needed.
# - vapply(..., character(1)): always returns a character vector, whereas
#   sapply() changes its return type with the input (e.g. a list for empty
#   input).
# Ids with fewer than four pieces yield NA. The result is named by the ids.
your.ids <- vapply(
  as.character(dat$id),
  function(id) strsplit(id, "_", fixed = TRUE)[[1]][4],
  character(1)
)
Or if this is a data.table, perhaps something like this:
# Add the 4th "_"-separated piece of id as a new column, by reference.
# `keep = 4L` makes tstrsplit() return only that piece; TRUE is spelled out
# because T is an ordinary variable that can be reassigned.
dat[, idstring := tstrsplit(id, "_", fixed = TRUE, keep = 4L)]
Applied to your code it looks like this:
library(data.table)

# Rebuild the example table from the question. The header row has one field
# fewer than the data rows, so read.table() uses the first field of each data
# row ("1:", "2:", ...) as row names; as.data.table() then drops them.
# Wrapping the call directly avoids needing the magrittr pipe.
dat <- as.data.table(read.table(text=
" id count total SEXO EDAD IDENTIF_AFILIADO
1: 952815090_12_06_Q643 4 133.34 M 39 952815090
2: 952443257_10_17_C64 9 64.32 F 5 952443257
3: 931131767_9_10_C716 2 21.88 M 1 931131767
4: 931131767_8_13_C716 15 173.70 M 1 931131767
5: 931131767_1_09_C716 1 10.94 M 0 931131767
", header = TRUE))
# Keep only the 4th "_"-separated piece of id as the new column idstring.
dat[, idstring := tstrsplit(id, "_", fixed = TRUE, keep = 4L)]
print(dat)
Output:
id count total SEXO EDAD IDENTIF_AFILIADO idstring
1: 952815090_12_06_Q643 4 133.34 M 39 952815090 Q643
2: 952443257_10_17_C64 9 64.32 F 5 952443257 C64
3: 931131767_9_10_C716 2 21.88 M 1 931131767 C716
4: 931131767_8_13_C716 15 173.70 M 1 931131767 C716
5: 931131767_1_09_C716 1 10.94 M 0 931131767 C716

You can delete everything until last underscore.
# ".*" is greedy, so ".*_" consumes everything up to and including the last
# underscore; replacing that with "" leaves only the trailing code.
sub(pattern = ".*_", replacement = "", x = df[["id"]])
#[1] "Q643" "C64" "C716" "C716" "C716"
data
# Example data from the question, one argument per column.
df <- data.frame(
  id = c(
    "952815090_12_06_Q643", "952443257_10_17_C64", "931131767_9_10_C716",
    "931131767_8_13_C716", "931131767_1_09_C716"
  ),
  count = c(4L, 9L, 2L, 15L, 1L),
  total = c(133.34, 64.32, 21.88, 173.7, 10.94),
  SEXO = c("M", "F", "M", "M", "M"),
  EDAD = c(39L, 5L, 1L, 1L, 0L),
  IDENTIF_AFILIADO = c(
    952815090L, 952443257L, 931131767L, 931131767L, 931131767L
  ),
  stringsAsFactors = FALSE
)

Related

Complex dataframe values selection based on both rows and columns

I need to select some values on each row of the dataset below and compute a sum.
This is a part of my dataset.
> prova
key_duration1 key_duration2 key_duration3 KeyPress1RESP KeyPress2RESP KeyPress3RESP
18 3483 364 3509 b n m
19 2367 818 3924 b n m
20 3775 1591 802 b m n
21 929 3059 744 n b n
22 3732 530 1769 b n m
23 3503 2011 2932 b n b
24 3684 1424 1688 b n m
Rows are trials of the experiment and columns are the keys pressed, in temporal sequence (keypressRESP) and the amount of time of the key until the next one (key_duration).
So for example in the first trial (first row) I pressed "b" and after 3483 ms I pressed "n" and so on.
This is my dataframe
structure(list(key_duration1 = c(3483L, 2367L, 3775L, 929L, 3732L,
3503L, 3684L), key_duration2 = c(364L, 818L, 1591L, 3059L, 530L,
2011L, 1424L), key_duration3 = c(3509, 3924, 802, 744, 1769,
2932, 1688), KeyPress1RESP = structure(c(2L, 2L, 2L, 4L, 2L,
2L, 2L), .Label = c("", "b", "m", "n"), class = "factor"), KeyPress2RESP = structure(c(4L,
4L, 3L, 2L, 4L, 4L, 4L), .Label = c("", "b", "m", "n"), class = "factor"),
KeyPress3RESP = structure(c(3L, 3L, 4L, 4L, 3L, 2L, 3L), .Label = c("",
"b", "m", "n"), class = "factor")), row.names = 18:24, class = "data.frame")
I need a method to select, in each row (trial), all "b" values, compute the sum of the corresponding key_duration values, and store the result in a new column; then the same for "m".
How can I do this?
I think I need a function similar to 'apply()', but one that computes over only the selected values in each row rather than every value.
apply(prova[,1:3],1,sum)
Thanks
Here is a way using data.table.
# Per-trial duration totals for the "b" and "m" keys, using data.table.
library(data.table)
# convert prova to a data.table
setDT(prova)
# melt: add a row index (idx) to prova, then reshape to long format so that
# each row holds one (duration, RESP) pair; patterns() matches the
# key_duration* and KeyPress* columns as the two groups of measure columns
prova_long <-
melt(
prova[, idx := 1:.N],
id.vars = "idx",
measure.vars = patterns("^key_duration", "^KeyPress"),
variable.name = "key",
value.name = c("duration", "RESP")
)
# aggregate: drop the "n" presses and sum the durations per trial and key
prova_aggr <- prova_long[RESP != "n", .(duration_sum = sum(duration)), by = .(idx, RESP)]
# spread and join: one column per key (sum_b, sum_m), joined back to prova
# on idx and added by reference; a trial without that key gets NA
prova[dcast(prova_aggr, idx ~ paste0("sum_", RESP)), c("sum_b", "sum_m") := .(sum_b, sum_m), on = "idx"]
prova
Result
# key_duration1 key_duration2 key_duration3 KeyPress1RESP KeyPress2RESP KeyPress3RESP idx sum_b sum_m
#1: 3483 364 3509 b n m 1 3483 3509
#2: 2367 818 3924 b n m 2 2367 3924
#3: 3775 1591 802 b m n 3 3775 1591
#4: 929 3059 744 n b n 4 3059 NA
#5: 3732 530 1769 b n m 5 3732 1769
#6: 3503 2011 2932 b n b 6 6435 NA
#7: 3684 1424 1688 b n m 7 3684 1688
The idea is to reshape your data to long format, aggregate by "RESP" per row. Spread the result and join back to your initial data.
With tidyverse you can do:
# Per-trial duration totals for the "b" and "m" keys, placed in front of the
# original columns.
#
# The two long tables below have the same row order (all rows of the 1st
# column, then the 2nd, then the 3rd), so row j of the key-press table
# belongs to row j of the duration table. The value columns get explicit
# names (resp, duration) rather than depending on how bind_cols() renames
# duplicated columns: the automatic name `val1` is not produced by
# dplyr >= 1.0, so code referring to it fails there.

# key presses, long format: rowid / var / resp
key_long <- df %>%
  select(starts_with("KeyPress")) %>%
  rowid_to_column() %>%
  gather(var, resp, -rowid)

# durations, long format: rowid / var / duration
duration_long <- df %>%
  select(starts_with("key_")) %>%
  rowid_to_column() %>%
  gather(var, duration, -rowid)

key_long %>%
  mutate(duration = duration_long$duration) %>%
  group_by(rowid) %>%
  # sum() over an empty selection is 0, so a trial without the key gets 0
  summarise(
    b_values = sum(duration[resp == "b"]),
    m_values = sum(duration[resp == "m"])
  ) %>%
  # attach the original columns again, matching on the row id
  left_join(rowid_to_column(df), by = "rowid") %>%
  select(-rowid)
b_values m_values key_duration1 key_duration2 key_duration3 KeyPress1RESP KeyPress2RESP KeyPress3RESP
<dbl> <dbl> <int> <int> <dbl> <fct> <fct> <fct>
1 3483. 3509. 3483 364 3509. b n m
2 2367. 3924. 2367 818 3924. b n m
3 3775. 1591. 3775 1591 802. b m n
4 3059. 0. 929 3059 744. n b n
5 3732. 1769. 3732 530 1769. b n m
6 6435. 0. 3503 2011 2932. b n b
7 3684. 1688. 3684 1424 1688. b n m
First, it splits the df into two: one with variables starting with "KeyPress" and one with variables starting with "key_". Second, it transforms the two dfs from wide to long format and combines them by columns. Third, it creates a summary for "b" and "m" values according to row ID. Finally, it merges the results with the original df.
You can make a logical matrix from the KeyPress columns, multiply it by the key_duration subset and then take their rowSums.
# Columns 4:6 hold the pressed keys and columns 1:3 the matching durations.
# Comparing the key columns with a key gives TRUE/FALSE values that act as
# 1/0 when multiplied, so only that key's durations survive; rowSums() then
# totals them per trial.
prova$b_values <- rowSums((prova[, 4:6] == "b") * prova[, 1:3])
# Same for the "n" key. Columns are selected by position, so the b_values
# column appended above (position 7) does not affect this line.
prova$n_values <- rowSums((prova[, 4:6] == "n") * prova[, 1:3])
key_duration1 key_duration2 key_duration3 KeyPress1RESP KeyPress2RESP KeyPress3RESP b_values n_values
18 3483 364 3509 b n m 3483 364
19 2367 818 3924 b n m 2367 818
20 3775 1591 802 b m n 3775 802
21 929 3059 744 n b n 3059 1673
22 3732 530 1769 b n m 3732 530
23 3503 2011 2932 b n b 6435 2011
24 3684 1424 1688 b n m 3684 1424
It works because the logical values are coerced to numeric 1s or 0s, and only the values for individual keys are retained.
Extra: to generalise, you could instead use a function and tidyverse/purrr to map it:
# Sum, per row, the durations that belong to one key.
#
# key            key label to total, e.g. "b"
# data           data frame with the duration and key-press columns;
#                defaults to the global `prova`, so get_sums("b") still works
# duration_cols  positions of the duration columns (default 1:3)
# key_cols       positions of the key-press columns (default 4:6), in the
#                same order as duration_cols
#
# Returns a numeric vector with one total per row of `data`.
get_sums <- function(key, data = prova, duration_cols = 1:3, key_cols = 4:6) {
  # TRUE/FALSE act as 1/0 in the product, so only the durations whose key
  # press equals `key` contribute to the row sum.
  rowSums((data[, key_cols] == key) * data[, duration_cols])
}
# Output column name -> key label whose durations are summed into it.
keylist <- list(b_values = "b", n_values = "n", m_values = "m")
library(tidyverse)
# Call get_sums() once per key; map_dfr() collects the results into a data
# frame with one column per name in keylist, and bind_cols() appends those
# columns to prova.
bind_cols(prova, map_dfr(keylist, get_sums))

Split multiple columns into multiple columns using r

I have a txt file which contains 20 columns and 300 rows. The sample of my data is given below.
id sub A1 A2 B1 B2 C1
96 AAA 01:01:01:01/01:01:01:02N 29:02:01 08:01:01/08:19N 44:03:01/44:03:03/44:03:04 07:01:01/07:01:02
97 AAA 03:01:01:01/03:01:01:02N 30:08:01 09:02:01/08:19N 44:03:01/44:03:03/44:03:04 07:01:01/07:01:02
98 AAA 01:01:01:01/01:01:01:02N/01:22N 29:02:01 08:01:01/08:19N 44:03:01/44:03:03/44:03:04 07:09:01/07:01:02
99 AAA 03:01:01:01 30:08:01 09:02:01/08:19N 44:03:01/44:03:03/44:03:04 07:08:01/07:01:02
I need to separate the columns (A1,A2,B1....) with the separator "/" using r.
The output would be:
id sub A1_1 A1_2 A2 B1_1 B1_2 B2_1 B2_2 ..
96 AAA 01:01:01:01 01:01:01:02N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 ...
I could find functions to split one column into multiple columns, but I could not find a solution that does this for several columns at once.
Here is a tidyverse solution.
# Split every "/"-separated column of df into numbered columns.
library(tidyverse)
df %>%
# long format: every column except the first two becomes key/value pairs
gather(key, value, -c(1:2)) %>%
# one row per "/"-separated piece of value
separate_rows(value, sep = "/") %>%
# number the pieces within each (column, id) pair: <column>_1, <column>_2, ...
group_by(key, id) %>%
mutate(key2 = paste0(key, "_", seq_along(key))) %>%
ungroup() %>%
# drop the un-numbered key and go back to wide format, one column per key2
select(-key) %>%
spread(key2, value)
# A tibble: 4 x 13
# id sub A1_1 A1_2 A1_3 A2_1 B1_1 B1_2 B2_1 B2_2 B2_3 C1_1 C1_2
#* <fct> <fct> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 96 AAA 01:01:01:01 01:01:01:02N <NA> 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
#2 97 AAA 03:01:01:01 03:01:01:02N <NA> 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
#3 98 AAA 01:01:01:01 01:01:01:02N 01:22N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 07:09:01 07:01:02
#4 99 AAA 03:01:01:01 <NA> <NA> 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04 07:08:01 07:01:02
After gathering all columns except the first and the second (-c(1:2)), I used tidyr::separate_rows to separate the values in the newly created column value by "/". After creating a new column key2, which is column key with the suffix _1, _2, ... up to the number of pieces, I dropped column key and spread column key2 by value.
data
df <- structure(list(id = structure(1:4, .Label = c("96", "97",
"98", "99"), class = "factor"), sub = structure(c(1L,
1L, 1L, 1L), .Label = "AAA", class = "factor"), A_A1 = structure(c(1L,
4L, 2L, 3L), .Label = c("01:01:01:01/01:01:01:02N", "01:01:01:01/01:01:01:02N/01:22N",
"03:01:01:01", "03:01:01:01/03:01:01:02N"), class = "factor"),
A_A2 = structure(c(1L, 2L, 1L, 2L), .Label = c("29:02:01",
"30:08:01"), class = "factor"), B_B1 = structure(c(1L,
2L, 1L, 2L), .Label = c("08:01:01/08:19N", "09:02:01/08:19N"
), class = "factor"), B_B2 = structure(c(1L, 1L, 1L, 1L
), .Label = "44:03:01/44:03:03/44:03:04", class = "factor"),
C1 = structure(c(1L, 1L, 3L, 2L), .Label = c("07:01:01/07:01:02",
"07:08:01/07:01:02", "07:09:01/07:01:02"), class = "factor")), .Names = c("id",
"sub", "A_A1", "A_A2", "B_B1", "B_B2", "C_C1"), class = "data.frame", row.names = c(NA,
-4L))
I'd take an approach like the following:
library(data.table)
setDT(df) # turn df into a data.table

# names of the columns that need splitting
cols <- grep("^HLA", names(df), value = TRUE)

# For every target column: split on "/", add the pieces to the data.table by
# reference as <column>_1, <column>_2, ..., then drop the original column.
for (col in cols) {
  pieces <- tstrsplit(df[[col]], "/")
  piece_names <- sprintf("%s_%d", col, seq_along(pieces))
  set(df, j = piece_names, value = pieces)
  set(df, j = col, value = NULL)
}
Here's the result:
df[]
# id sub HLA_A1_1 HLA_A1_2 HLA_A1_3 HLA_A2_1 HLA_B1_1 HLA_B1_2 HLA_B2_1 HLA_B2_2 HLA_B2_3 HLA_C1_1 HLA_C1_2
# 1: HG00096 GBR 01:01:01:01 01:01:01:02N NA 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
# 2: HG00097 GBR 03:01:01:01 03:01:01:02N NA 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
# 3: HG00098 GBR 01:01:01:01 01:01:01:02N 01:22N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 07:09:01 07:01:02
# 4: HG00099 GBR 03:01:01:01 NA NA 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04 07:08:01 07:01:02
Aside from being easier to scale than the accepted answer (things aren't really hard-coded), this is at least twice as fast as that approach, and a lot faster than the "tidyverse" approach, which is quite inefficient because it first makes the data very long before going back into a wide format.
Benchmarks
To get a sense of the performance difference, try the following:
Test functions
# data.table approach: split every column whose name starts with "HLA" on
# "/" into numbered columns. The columns are added and removed with set(),
# i.e. on the object that was passed in.
myfun <- function(df) {
  for (col in grep("^HLA", names(df), value = TRUE)) {
    pieces <- tstrsplit(df[[col]], "/")
    set(df, j = sprintf("%s_%d", col, seq_along(pieces)), value = pieces)
    set(df, j = col, value = NULL)
  }
  df[]
}
# tidyverse approach: go long, split the values on "/", number the pieces
# within each (column, id) pair and go wide again.
tidyfun <- function(df) {
  long <- gather(df, key, value, -c(1:2))
  long <- separate_rows(long, value, sep = "/")
  long <- group_by(long, key, id)
  long <- mutate(long, key2 = paste0(key, "_", seq_along(key)))
  long <- ungroup(long)
  long <- select(long, -key)
  spread(long, key2, value)
}
# reshape2 approach for one column: split column number `col` of df on "/"
# into as many columns as the longest entry has pieces, named
# <column>_A, <column>_B, ...
getIt <- function(df, col) {
  values <- as.character(df[, col])
  piece_counts <- sapply(strsplit(values, split = "/"), length)
  x <- max(piece_counts)
  piece_names <- paste0(names(df)[col], "_", LETTERS[1:x])
  colsplit(string = values, pattern = "/", names = piece_names)
}
# reshape2 approach for the whole table: keep the first two columns and
# append the split versions of columns 3 to 6.
reshape2fun <- function(dfdf) {
  split_cols <- lapply(3:6, function(j) getIt(dfdf, j))
  do.call(cbind, c(list(dfdf[, 1:2]), split_cols))
}
4 rows....
library(microbenchmark)
dfdf <- as.data.frame(df)
microbenchmark(myfun(copy(df)), reshape2fun(dfdf), tidyfun(df))
# Unit: microseconds
# expr min lq mean median uq max neval
# myfun(copy(df)) 241.55 272.5965 625.7634 359.148 380.0395 28632.94 100
# reshape2fun(dfdf) 5076.24 5368.3835 5841.8784 5539.577 5639.8765 34176.13 100
# tidyfun(df) 37864.68 39435.1915 41152.5916 39801.499 40489.7055 70019.04 100
10,000 rows....
biggerdf <- rbindlist(replicate(2500, df, FALSE)) # nrow = 10,000
dfdf <- as.data.frame(biggerdf)
microbenchmark(myfun(copy(biggerdf)), reshape2fun(dfdf), tidyfun(biggerdf), times = 10)
# Unit: milliseconds
# expr min lq mean median uq max neval
# myfun(copy(biggerdf)) 50.87452 52.0059 54.59288 53.03503 53.79347 68.69892 10
# reshape2fun(dfdf) 120.90291 124.3893 137.54154 126.06213 157.50532 159.15069 10
# tidyfun(biggerdf) 1312.75422 1350.6651 1394.93082 1358.21612 1373.86793 1732.86521 10
1,000,000 rows....
BIGGERdf <- rbindlist(replicate(100, biggerdf, FALSE)) # nrow = 1,000,000
dfdf <- as.data.frame(BIGGERdf)
system.time(tidyfun(BIGGERdf)) # > 2 minutes!
# user system elapsed
# 141.373 1.048 142.403
microbenchmark(myfun(copy(BIGGERdf)), reshape2fun(dfdf), times = 5)
# Unit: seconds
# expr min lq mean median uq max neval
# myfun(copy(BIGGERdf)) 5.180048 5.574677 6.026515 5.764467 6.498967 7.114415 5
# reshape2fun(dfdf) 8.858202 9.095027 9.629969 9.264896 10.192161 10.739560 5
I suggest a reshape2 solution taking care of not knowing the number of parts:
> dput(pz1)
structure(list(id = c("HG00096", "HG00097", "HG00098", "HG00099"
), sub = c("GBR", "GBR", "GBR", "GBR"), HLA_A1 = c("01:01:01:01/01:01:01:02N",
"03:01:01:01/03:01:01:02N", "01:01:01:01/01:01:01:02N/01:22N",
"03:01:01:01"), HLA_A2 = c("29:02:01", "30:08:01", "29:02:01",
"30:08:01"), HLA_B1 = c("08:01:01/08:19N", "09:02:01/08:19N",
"08:01:01/08:19N", "09:02:01/08:19N"), HLA_B2 = c("44:03:01/44:03:03/44:03:04",
"44:03:01/44:03:03/44:03:04", "44:03:01/44:03:03/44:03:04", "44:03:01/44:03:03/44:03:04"
), HLA_C1 = c("07:01:01/07:01:02", "07:01:01/07:01:02", "07:09:01/07:01:02",
"07:08:01/07:01:02")), .Names = c("id", "sub", "HLA_A1", "HLA_A2",
"HLA_B1", "HLA_B2", "HLA_C1"), row.names = c(NA, -4L), class = "data.frame")
add this function:
# Load reshape2 from the default library paths; a hard-coded lib.loc only
# exists on the machine it was written on.
library(reshape2)

# Split one "/"-separated column of a data frame into several columns.
#
# df   data frame
# col  position (or name) of the column to split
#
# Returns a data frame with one column per piece, named <column>_A,
# <column>_B, ...; entries with fewer pieces are padded by colsplit().
# LETTERS has 26 entries, so at most 26 pieces can be named.
getIt <- function(df, col) {
  # as.character(): strsplit() rejects factor columns
  values <- as.character(df[[col]])
  col_name <- if (is.character(col)) col else names(df)[col]
  ### get the max parts for column
  x <- max(lengths(strsplit(values, split = "/", fixed = TRUE)))
  colsplit(
    string = values,
    pattern = "/",
    names = paste0(col_name, "_", LETTERS[seq_len(x)])
  )
}
after you have this function you can easily do:
> getIt(pz1,3)
HLA_A1_A HLA_A1_B HLA_A1_C
1 01:01:01:01 01:01:01:02N
2 03:01:01:01 03:01:01:02N
3 01:01:01:01 01:01:01:02N 01:22N
4 03:01:01:01
and a simple cbind with the original dataframe (with or without the original columns) :
> cbind(pz1[,1:2],getIt(pz1,3),getIt(pz1,4),getIt(pz1,5),getIt(pz1,6))
id sub HLA_A1_A HLA_A1_B HLA_A1_C HLA_A2_A HLA_B1_A HLA_B1_B HLA_B2_A HLA_B2_B HLA_B2_C
1 HG00096 GBR 01:01:01:01 01:01:01:02N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04
2 HG00097 GBR 03:01:01:01 03:01:01:02N 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04
3 HG00098 GBR 01:01:01:01 01:01:01:02N 01:22N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04
4 HG00099 GBR 03:01:01:01 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04
I second @Sotos's advice: it is important to write a reproducible example so the focus is only on the problem at hand.
I came up with this fake data to try to answer your question:
> df <- data.frame(
+ id = c(1:5),
+ sub = sample(c("GBR", "BRA"), size = 5, replace = T),
+ HLA_A = paste0(rep("01:01", 5), "/", rep("01:02N")),
+ HLA_B = paste0(rep("01:03", 5), "/", "01:42N", "/", "32:20"),
+ HLA_C = paste0(rep("01:03", 5)), stringsAsFactors = F)
>
>
> df
id sub HLA_A HLA_B HLA_C
1 1 GBR 01:01/01:02N 01:03/01:42N/32:20 01:03
2 2 BRA 01:01/01:02N 01:03/01:42N/32:20 01:03
3 3 GBR 01:01/01:02N 01:03/01:42N/32:20 01:03
4 4 GBR 01:01/01:02N 01:03/01:42N/32:20 01:03
5 5 BRA 01:01/01:02N 01:03/01:42N/32:20 01:03
You can use strsplit() to split the column by a given character (in this case "/"). Use do.call(rbind, .) to bind the resulting list elements into columns. Repeat this process for the columns you wish to target and then bind them all with the id and sub columns. Here is the solution:
Without using any dependencies:
> col.ind <- grep(x = names(df), pattern = "HLA", value = T, ignore.case = T) # your target columns
>
> # lapply to loop the column split process, output is a list, so you need to column-bind the resulting objects
>
> cols.list <- lapply(seq_along(col.ind), function(x){
+
+ p1 <- do.call(rbind, strsplit(df[[col.ind[[x]]]], split = "/")) # split col by "/"
+
+ p2 <- data.frame(p1, stringsAsFactors = F) # make it into a data.frame
+
+ i <- ncol(p2) # this is an index placeholder that will enable you to rename the recently split columns in a sequential manner
+
+ colnames(p2) <- paste0(col.ind[[x]], c(1:i)) # rename columns
+
+ return(p2) # return the object of interest
+ }
+ )
>
>
> new.df <- cbind(df[1:2], do.call(cbind, cols.list)) # do.call once again to bind the lapply object and column-bind those with the first two columns of your initial data.frame
> new.df
id sub HLA_A1 HLA_A2 HLA_B1 HLA_B2 HLA_B3 HLA_C1
1 1 GBR 01:01 01:02N 01:03 01:42N 32:20 01:03
2 2 BRA 01:01 01:02N 01:03 01:42N 32:20 01:03
3 3 GBR 01:01 01:02N 01:03 01:42N 32:20 01:03
4 4 GBR 01:01 01:02N 01:03 01:42N 32:20 01:03
5 5 BRA 01:01 01:02N 01:03 01:42N 32:20 01:03

Merging of 2 rows if the date is the same or +- 7 days, and the ID is the same

So I've been trying to get my head around this but I can't figure out how to do it.
This is an example:
ID Hosp. date Discharge date
1 2006-02-02 2006-02-04
1 2006-02-04 2006-02-18
1 2006-02-22 2006-03-24
1 2008-08-09 2008-09-14
2 2004-01-03 2004-01-08
2 2004-01-13 2004-01-15
2 2004-06-08 2004-06-28
What I want is a way to combine rows by ID if the discharge date is the same as the Hosp. date in the next row (or within ±7 days of it). So it would look like this:
ID Hosp. date Discharge date
1 2006-02-02 2006-03-24
1 2008-08-09 2008-09-14
2 2004-01-03 2004-01-15
2 2004-06-08 2004-06-28
Using the data.table-package:
# Merge consecutive hospital stays of the same ID into one episode.
# load the package
library(data.table)
# convert to a 'data.table'
setDT(d)
# make sure you have the correct order
setorder(d, ID, Hosp.date)
# summarise:
# 1) within each ID, a row starts a new episode when its Hosp.date is more
#    than 7 days after the previous row's Discharge.date; shift() supplies
#    the previous value, and the first row is compared with its own
#    Discharge.date. cumsum() turns the flags into an episode counter (grp).
# 2) per ID and grp, keep the earliest Hosp.date and latest Discharge.date.
# NOTE(review): only the immediately preceding row's discharge date is
# checked, so a short stay nested inside a longer one could open a new
# episode too early - confirm whether nested stays occur in the data.
d[, grp := cumsum(Hosp.date > (shift(Discharge.date, fill = Discharge.date[1]) + 7))
, by = ID
][, .(Hosp.date = min(Hosp.date), Discharge.date = max(Discharge.date))
, by = .(ID,grp)]
you get:
ID grp Hosp.date Discharge.date
1: 1 0 2006-02-02 2006-03-24
2: 1 1 2008-08-09 2008-09-14
3: 2 0 2004-01-03 2004-01-15
4: 2 1 2004-06-08 2004-06-28
The same logic with dplyr:
library(dplyr)
# Merge consecutive hospital stays of the same ID into one episode.
d %>%
  arrange(ID, Hosp.date) %>%
  group_by(ID) %>%
  # a row starts a new episode when it begins more than 7 days after the
  # previous row's discharge; cumsum() numbers the episodes within each ID
  mutate(grp = cumsum(Hosp.date > (lag(Discharge.date, default = Discharge.date[1]) + 7))) %>%
  # `.add = TRUE` keeps the ID grouping and adds grp to it
  # (the older spelling `add = TRUE` is deprecated since dplyr 1.0)
  group_by(grp, .add = TRUE) %>%
  summarise(Hosp.date = min(Hosp.date), Discharge.date = max(Discharge.date)) %>%
  # summarise() removes only the last grouping level; drop the rest as well
  ungroup()
Used data:
d <- structure(list(ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L),
Hosp.date = structure(c(13181, 13183, 13201, 14100, 12420, 12430, 12577), class = "Date"),
Discharge.date = structure(c(13183, 13197, 13231, 14136, 12425, 12432, 12597), class = "Date")),
.Names = c("ID", "Hosp.date", "Discharge.date"), class = "data.frame", row.names = c(NA, -7L))

Split a nested list of a dataframe column into different columns

I have tried related solutions but they do not work for my case. I have a dataframe that has a nested list in one column and i want to split this list and put it in columns.The list contains another list with the time stamp for each month(ts) and the consumption for each month(v). The dataframe is:
id monthly_consum
1 112 list1
2 34 list2
3 54 list3
where
list1<-list(list(ts = "2016-01-01T00:00:00+01:00", v = 466.6),list(ts = "2016-02-01T00:00:00+01:00", v = 565.6),
list(ts = "2016-03-01T00:00:00+01:00", v = 765.6),list(ts = "2016-04-01T00:00:00+01:00", v = 888.6),
list(ts = "2016-05-01T00:00:00+01:00", v = 465),list(ts = "2016-06-01T00:00:00+01:00", v = 465.6),
list(ts = "2016-07-01T00:00:00+01:00", v = 786),list(ts = "2016-08-01T00:00:00+01:00", v = 435),
list(ts = "2016-09-01T00:00:00+01:00", v = 568),list(ts = "2016-10-01T00:00:00+01:00", v = 678),
list(ts = "2016-11-01T00:00:00+01:00", v = 522),list(ts = "2016- 12-01T00:00:00+01:00", v = 555))
list2<-list(list(ts = "2016-01-01T00:00:00+01:00", v = 333.6),list(ts = "2016-02-01T00:00:00+01:00", v = 565.6),
list(ts = "2016-03-01T00:00:00+01:00", v = 765.6),list(ts = "2016-04-01T00:00:00+01:00", v = 333.6),
list(ts = "2016-05-01T00:00:00+01:00", v = 465),list(ts = "2016-06-01T00:00:00+01:00", v = 465.6),
list(ts = "2016-07-01T00:00:00+01:00", v = 786),list(ts = "2016-08-01T00:00:00+01:00", v = 435),
list(ts = "2016-09-01T00:00:00+01:00", v = 568),list(ts = "2016-10-01T00:00:00+01:00", v = 678),
list(ts = "2016-11-01T00:00:00+01:00", v = 522),list(ts = "2016-12-01T00:00:00+01:00", v = 555))
list3<-list(list(ts = "2016-01-01T00:00:00+01:00", v = 323.6),list(ts = "2016-02-01T00:00:00+01:00", v = 565.6),
list(ts = "2016-03-01T00:00:00+01:00", v = 333.6),list(ts = "2016-04-01T00:00:00+01:00", v = 888.6),
list(ts = "2016-05-01T00:00:00+01:00", v = 465),list(ts = "2016-06-01T00:00:00+01:00", v = 465.6),
list(ts = "2016-07-01T00:00:00+01:00", v = 786),list(ts = "2016-08-01T00:00:00+01:00", v = 435),
list(ts = "2016-09-01T00:00:00+01:00", v = 568),list(ts = "2016-10-01T00:00:00+01:00", v = 678),
list(ts = "2016-11-01T00:00:00+01:00", v = 522),list(ts = "2016-12-01T00:00:00+01:00", v = 555))
I would like to split the list and create a dataframe which will have one of the 2 following formats:
id ts.1 cons.1 ts.2 cons.2 ts.3 etc..
1 112 2016-01-01T00:00:00+01:00 466.6 2016-02.. ... ...
2 34 2016-01-01T00:00:00+01:00 333.6 2016-02.. ... ...
3 54 2016-01-01T00:00:00+01:00 323.6 2016-02.. ... ...
OR
id ts consumption
112 2016-01-01T00:00:00+01:00 466.6
112 2016-02-01T00:00:00+01:00 565.6
112 2016-03-01T00:00:00+01:00 765.6
112 2016-04-01T00:00:00+01:00 888.6
112 2016-05-01T00:00:00+01:00 465
112 2016-06-01T00:00:00+01:00 465.6
112 2016-07-01T00:00:00+01:00 786
112 2016-08-01T00:00:00+01:00 435
112 2016-09-01T00:00:00+01:00 568
112 2016-10-01T00:00:00+01:00 678
112 2016-11-01T00:00:00+01:00 522
112 2016-12-01T00:00:00+01:00 555
34 2016-01-01T00:00:00+01:00 333.6
34 2016-02-01T00:00:00+01:00 565.6
34 2016-03-01T00:00:00+01:00 765.6
etc............
Could you help me? I am using data.frame(matrix(unlist..)) but it does not give the format that I want. When I use rbindlist I get:
"Error in rbindlist(....) :
Item 1 of list input is not a data.frame, data.table or list"
Thank you in advance!
UPDATE
Using dput i would get (in the real problem):
>dput(locs_total[9:12,1:5])
# dput() of four rows of the real data. X.avg_cons_main. is a list column:
# the first row holds a list of (ts, v) records, the other three are NULL.
structure(list(X.dep_id. = c("34", "34", "34", "34"), X.loc_id. = c("17761",
"17406", "23591", "27838"), X.surface. = c("200", "1250", "54",
"150"), X.sector. = c("HOUSING", "SMALL-STORE-FOOD", "LIBRARY",
"OFFICE-BUILDING"),
X.avg_cons_main. = list(list(
structure(list(ts = "2016-01-01T00:00:00+01:00", v = 466.65),
.Names = c("ts", "v")),
structure(list(ts = "2016-02-01T00:00:00+01:00", v = 406.45),
.Names = c("ts", "v")),
structure(list(ts = "2016-03-01T00:00:00+01:00", v = 483.35),
.Names = c("ts", "v")),
structure(list(ts = "2016-04-01T00:00:00+02:00", v = 79.45),
.Names = c("ts", "v"))), NULL, NULL, NULL)), .Names = c("X.dep_id.", "X.loc_id.",
"X.surface.", "X.sector.", "X.avg_cons_main."
), row.names = c("9", "10", "11", "12"), class = "data.frame")
If the ids are also in the lists, you can use dplyr::bind_rows
dplyr::bind_rows(list1, list2, list3)
# A tibble: 36 × 2
ts v
<chr> <dbl>
1 2016-01-01T00:00:00+01:00 466.6
2 2016-02-01T00:00:00+01:00 565.6
3 2016-03-01T00:00:00+01:00 765.6
4 2016-04-01T00:00:00+01:00 888.6
5 2016-05-01T00:00:00+01:00 465.0
6 2016-06-01T00:00:00+01:00 465.6
7 2016-07-01T00:00:00+01:00 786.0
8 2016-08-01T00:00:00+01:00 435.0
9 2016-09-01T00:00:00+01:00 568.0
10 2016-10-01T00:00:00+01:00 678.0
# ... with 26 more rows
To add IDs from another df
library(dplyr)
# Lookup table: the id that belongs to each of the lists list1..list3.
# tibble() replaces the deprecated data_frame(); dplyr re-exports it.
ids <- tibble(list_id = c(112, 34, 54),
monthly_consum = c("list1", "list2", "list3"))
If we consider nested lists, you can use purrr::map as follows:
-combine the three lists in one list
k <- list(list1, list2, list3)
-use map to bind_rows in each column independently
k1 <- purrr:: map(k, bind_rows)
-use the ids as names for the lists
names(k1) <- ids$list_id
-bind_rows using .id
bind_rows(k1, .id = "id")
# A tibble: 36 × 3
id ts v
<chr> <chr> <dbl>
1 112 2016-01-01T00:00:00+01:00 466.6
2 112 2016-02-01T00:00:00+01:00 565.6
3 112 2016-03-01T00:00:00+01:00 765.6
4 112 2016-04-01T00:00:00+01:00 888.6
5 112 2016-05-01T00:00:00+01:00 465.0
6 112 2016-06-01T00:00:00+01:00 465.6
7 112 2016-07-01T00:00:00+01:00 786.0
8 112 2016-08-01T00:00:00+01:00 435.0
9 112 2016-09-01T00:00:00+01:00 568.0
10 112 2016-10-01T00:00:00+01:00 678.0
We can loop through the list
# Build one long data frame with the columns id / ts / consumption.
# - mget() fetches the objects named in df1$monthly_consum (list1, list2,
#   list3) by name
# - each fetched list of (ts, v) records is row-bound into a data frame
# - Map(cbind, id = ...) puts the matching id in front of each data frame
# - the outer do.call(rbind, ...) stacks the per-id data frames
res <- do.call(rbind, Map(cbind, id = df1$id, lapply(mget(df1$monthly_consum),
function(x) do.call(rbind.data.frame, x))))
# the third column holds the values from v; give it a descriptive name
names(res)[3] <- "consumption"
# replace the row names produced by rbind() with the default 1..n
row.names(res) <- NULL
head(res, 14)
# id ts consumption
#1 112 2016-01-01T00:00:00+01:00 466.6
#2 112 2016-02-01T00:00:00+01:00 565.6
#3 112 2016-03-01T00:00:00+01:00 765.6
#4 112 2016-04-01T00:00:00+01:00 888.6
#5 112 2016-05-01T00:00:00+01:00 465.0
#6 112 2016-06-01T00:00:00+01:00 465.6
#7 112 2016-07-01T00:00:00+01:00 786.0
#8 112 2016-08-01T00:00:00+01:00 435.0
#9 112 2016-09-01T00:00:00+01:00 568.0
#10 112 2016-10-01T00:00:00+01:00 678.0
#11 112 2016-11-01T00:00:00+01:00 522.0
#12 112 2016- 12-01T00:00:00+01:00 555.0
#13 34 2016-01-01T00:00:00+01:00 333.6
#14 34 2016-02-01T00:00:00+01:00 565.6
data
df1 <- structure(list(id = c(112L, 34L, 54L), monthly_consum = c("list1",
"list2", "list3")), .Names = c("id", "monthly_consum"),
class = "data.frame", row.names = c("1", "2", "3"))

Sort data by factor and output into a matrix (or df) R

I have looked through other posts and I think I have an idea of what I could do, but I want to be clear!
I have a very large data frame that contains 4 variables and a number of rows.
Chain ResId ResNum Energy
1 C O17 500 -37.03670
2 A ARG 8 -0.84560
3 A LEU 24 -0.56739
4 A ASP 25 -0.98583
5 B ARG 8 -0.64880
6 B LEU 24 -0.58380
7 B ASP 25 -0.85930
Each row contains CHAIN (A, B, or C), ResID, ResNum, and Energy. I would like to sort this data so that all of the energy values belonging to a specific Resid and num in each chain are clustered together. By cluster I mean all of the values for "ARG 8" are grouped or all of the rows containing "ARG 8" are grouped. I don't know which is more efficient. Ideally, I would like the output for all residues to be
ARG 8
0.000
0.000
0.000
where the "0.000" are the energy values for ARG 8 or O17 and so on.
Sorry for the header breaks, I wanted the data to be clean, but I can't insert images.
data
structure(list(Chain = structure(c(3L, 1L, 1L, 1L, 2L, 2L, 2L
), .Label = c("A", "B", "C"), class = "factor"), ResId = structure(c(4L,
1L, 3L, 2L, 1L, 3L, 2L), .Label = c("ARG", "ASP", "LEU", "O17"
), class = "factor"), ResNum = c(500L, 8L, 24L, 25L, 8L, 24L,
25L), Energy = c(-37.0367, -0.8456, -0.56739, -0.98583, -0.6488,
-0.5838, -0.8593)), .Names = c("Chain", "ResId", "ResNum", "Energy"
), class = "data.frame", row.names = c(NA, -7L))
If you want to convert to wide format
library(reshape2)
dcast(df, ResId+ResNum~paste0('Energy.',Chain), value.var='Energy')
# ResId ResNum Energy.A Energy.B Energy.C
#1 ARG 8 -0.84560 -0.6488 NA
#2 ASP 25 -0.98583 -0.8593 NA
#3 LEU 24 -0.56739 -0.5838 NA
#4 O17 500 NA NA -37.0367
After your edit, the output you are most likely looking for is:
library(reshape2)
dcast(df, ResId~Chain, value.var= 'Energy')
ResId A B C
1 ARG -0.84560 -0.6488 NA
2 ASP -0.98583 -0.8593 NA
3 LEU -0.56739 -0.5838 NA
4 O17 NA NA -37.0367
This will put the values together. You can further specify based on your desired output.
df[order(df$ResId), ]
Chain ResId ResNum Energy
2 A ARG 8 -0.84560
5 B ARG 8 -0.64880
4 A ASP 25 -0.98583
7 B ASP 25 -0.85930
3 A LEU 24 -0.56739
6 B LEU 24 -0.58380
1 C O17 500 -37.03670
#With dplyr
library(dplyr)
df %>%
arrange(ResId)
Chain ResId ResNum Energy
1 A ARG 8 -0.84560
2 B ARG 8 -0.64880
3 A ASP 25 -0.98583
4 B ASP 25 -0.85930
5 A LEU 24 -0.56739
6 B LEU 24 -0.58380
7 C O17 500 -37.03670
Data
# Example data: one row per (Chain, residue) with its energy value.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
df <- read.table(text = '
Chain ResId ResNum Energy
C O17 500 -37.0367
A ARG 8 -0.8456
A LEU 24 -0.56739
A ASP 25 -0.98583
B ARG 8 -0.6488
B LEU 24 -0.5838
B ASP 25 -0.8593', header = TRUE)
Try this:
df <- df[order(df$Chain, df$ResId, df$ResNum),]
where df is the name of your dataframe. This should order it for you.

Resources