I have a data.table like this (the data is not necessarily ordered by 'col1'):
col0 col1 col2
1: abc 1 a
2: abc 2 b
3: abc 3 c
4: abc 4 d
5: abc 5 e
6: def 1 a
7: def 2 b
8: def 3 c
9: def 4 d
10: def 5 e
I want to reshape it the following way
col0 col1 col2 new_1 new_2 new_3 new_4
1: abc 1 a NA NA NA NA
2: abc 2 b a NA NA NA
3: abc 3 c b a NA NA
4: abc 4 d c b a NA
5: abc 5 e d c b a
6: def 1 a NA NA NA NA
7: def 2 b a NA NA NA
8: def 3 c b a NA NA
9: def 4 d c b a NA
10: def 5 e d c b a
Basically, for each row I want the values of col2 that occurred earlier in the same group laid out alongside it, as shown above; if there is no such value, the corresponding new column should be NA.
I can of course do it by merging in col2 five times (roughly like the sketch below), but I need to do this on a large table where I would have to merge 20-30 times.
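One such merge would look something like this (a sketch only; it assumes the table is already a data.table called DT, that col1 is a consecutive index within each col0 group, and lag1/new_1 are just illustrative names):
library(data.table)
# bring in the col2 value from the row one position earlier within the same col0 group
lag1 <- DT[, .(col0, col1 = col1 + 1, new_1 = col2)]
DT <- merge(DT, lag1, by = c("col0", "col1"), all.x = TRUE)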
What is the best way to achieve it in R in 1 or 2 lines?
We can use shift from the devel version of data.table, i.e. v1.9.5 (instructions to install the devel version are here). By default, the type in shift is "lag". We can specify n as a vector, in this case 1:4, and assign (:=) the output to the new columns.
library(data.table)  # v1.9.5+
DT[, paste('new', 1:4, sep="_") := shift(col2, 1:4)]
DT
# col1 col2 new_1 new_2 new_3 new_4
#1: 1 a NA NA NA NA
#2: 2 b a NA NA NA
#3: 3 c b a NA NA
#4: 4 d c b a NA
#5: 5 e d c b a
For the new dataset 'DT2', we need to group by 'col0' and then do the shift on 'col2'
DT2[, paste('new', 1:4, sep="_") := shift(col2, 1:4), by = col0]
DT2
# col0 col1 col2 new_1 new_2 new_3 new_4
# 1: abc 1 a NA NA NA NA
# 2: abc 2 b a NA NA NA
# 3: abc 3 c b a NA NA
# 4: abc 4 d c b a NA
# 5: abc 5 e d c b a
# 6: def 1 a NA NA NA NA
# 7: def 2 b a NA NA NA
# 8: def 3 c b a NA NA
# 9: def 4 d c b a NA
#10: def 5 e d c b a
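For the 20-30 column case mentioned in the question, the same call generalises by just widening the lag vector passed to shift; nothing else changes (n_lags below is only an illustrative name):
n_lags <- 30  # however many previous values you need
DT2[, paste("new", seq_len(n_lags), sep = "_") := shift(col2, seq_len(n_lags)), by = col0]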
data
df1 <- structure(list(col1 = 1:5, col2 = c("a", "b", "c", "d", "e"),
new_1 = c(NA, "a", "b", "c", "d"), new_2 = c(NA, NA, "a",
"b", "c"), new_3 = c(NA, NA, NA, "a", "b"), new_4 = c(NA,
NA, NA, NA, "a")), .Names = c("col1", "col2", "new_1", "new_2",
"new_3", "new_4"), class = "data.frame", row.names = c(NA, -5L
))
DT <- as.data.table(df1)
df2 <- structure(list(col0 = c("abc", "abc", "abc", "abc", "abc",
"def",
"def", "def", "def", "def"), col1 = c(1L, 2L, 3L, 4L, 5L, 1L,
2L, 3L, 4L, 5L), col2 = c("a", "b", "c", "d", "e", "a", "b",
"c", "d", "e")), .Names = c("col0", "col1", "col2"),
class = "data.frame", row.names = c(NA, -10L))
DT2 <- as.data.table(df2)
Related
The solution to this simple problem has eluded me for several hours. I have a data table in which a value is identified by several classification variables (A, B, L). Where there are observations characterized by duplicate classification variables A & B, I want to retain the one that has the highest 'L'. So, if I have a table generated with this code
set.seed(17)
DT <- data.table(A=rep(c("a","b"),each=5),
B=c("a","b","c","d","d","a","b","b","c","d"),
L=c(1,1,1,2,1,1,1,2,1,1),
val=rnbinom(10, size=2, mu=3))
Making the following:
A B L val
1: a a 1 1
2: a b 1 10
3: a c 1 3
4: a d 1 5
5: a d 2 2
6: b a 1 8
7: b b 1 7
8: b b 2 1
9: b c 1 2
10: b d 1 2
I have tried commands such as
setkey(DT,A,B,L)
DT[ , .(A,B,L,val) , mult="last"]
but I'm just not getting something.
I want a resulting table that looks like this
A B L val
1: a a 1 1
2: a b 1 10
3: a c 1 3
5: a d 2 2
6: b a 1 8
8: b b 2 1
9: b c 1 2
10: b d 1 2
DT[, lapply(.SD, last), .(A,B)]
should also work and seems to be a bit faster than the merge solution.
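As a rough timing sketch (this assumes the microbenchmark package is available; on a toy table like this the differences are tiny, but the same comparison can be run on a larger table):
library(data.table)
library(microbenchmark)
set.seed(17)
DT <- data.table(A = rep(c("a", "b"), each = 5),
                 B = c("a", "b", "c", "d", "d", "a", "b", "b", "c", "d"),
                 L = c(1, 1, 1, 2, 1, 1, 1, 2, 1, 1),
                 val = rnbinom(10, size = 2, mu = 3))
setkey(DT, A, B, L)  # the grouped-last approach relies on this ordering
microbenchmark(
  grouped_last = DT[, lapply(.SD, last), by = .(A, B)],
  merge_based  = merge(DT[, .(L = last(L)), by = .(A, B)], DT),
  times = 100L
)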
Another solution option
library(data.table)
dt <- structure(list(A = c("a", "a", "a", "a", "a", "b", "b", "b",
"b", "b"), B = c("a", "b", "c", "d", "d", "a", "b", "b", "c",
"d"), L = c(1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L), val = c(1L,
10L, 3L, 5L, 2L, 8L, 7L, 1L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-10L))
setDT(dt)
merge(dt[, list(L = last(L)), by = list(A, B)], dt)
#> A B L val
#> 1: a a 1 1
#> 2: a b 1 10
#> 3: a c 1 3
#> 4: a d 2 2
#> 5: b a 1 8
#> 6: b b 2 1
#> 7: b c 1 2
#> 8: b d 1 2
Created on 2021-03-24 by the reprex package (v1.0.0)
set.seed(17)
library(data.table)
DT <- data.table(A=rep(c("a","b"),each=5),
B=c("a","b","c","d","d","a","b","b","c","d"),
L=c(1,1,1,2,1,1,1,2,1,1),
val=rnbinom(10, size=2, mu=3))
result <- DT[DT[, .I[L == max(L)], by = list(A, B)]$V1]
> result
A B L val
1: a a 1 1
2: a b 1 1
3: a c 1 3
4: a d 2 12
5: b a 1 6
6: b b 2 2
7: b c 1 3
8: b d 1 5
Here's how I'd do it (without mult)
DT[order(-L), .SD[1], .(A,B)]
With mult, something like this would do it; note that I'm doing an actual join here:
DT[order(L)][unique(DT[, .(A, B)]), on = c('A', 'B'), mult = 'last']
#> A B L val
#> 1: a a 1 1
#> 2: a b 1 1
#> 3: a c 1 3
#> 4: a d 2 12
#> 5: b a 1 6
#> 6: b b 2 2
#> 7: b c 1 3
#> 8: b d 1 5
I have this data frame:
df <- data.frame(
id = rep(1:4, each = 4),
status = c(
NA, "a", "c", "a",
NA, "b", "c", "c",
NA, NA, "a", "c",
NA, NA, "b", "b"),
stringsAsFactors = FALSE)
For each group (id), I aim to remove the rows with one or more leading NAs that appear before an "a" (in the column "status"), but not those before a "b".
The final data frame should look like this:
structure(list(
id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 4L, 4L, 4L, 4L),
status = c("a", "c", "a", NA, "b", "c", "c", "a", "c", NA, NA, "b", "b")),
.Names = c("id", "status"), row.names = c(NA, -13L), class = "data.frame")
How do I do that?
Edit: alternatively, how would I do it while preserving other variables in the data frame, such as the variable otherVar in the following example:
df2 <- data.frame(
id = rep(1:4, each = 4),
status = c(
NA, "a", "c", "a",
NA, "b", "c", "c",
NA, NA, "a", "c",
NA, NA, "b", "b"),
otherVar = letters[1:16],
stringsAsFactors = FALSE)
We can group by 'id', summarise 'status' by pasting the elements together with toString, use gsub to remove the NAs before the 'a', and then convert back to 'long' format with separate_rows:
library(dplyr)
library(tidyr)
df %>%
group_by(id) %>%
summarise(status = gsub("(NA, ){1,}(?=a)", "", toString(status),
perl = TRUE)) %>%
separate_rows(status, convert = TRUE)
# A tibble: 13 x 2
# id status
# <int> <chr>
# 1 1 a
# 2 1 c
# 3 1 a
# 4 2 NA
# 5 2 b
# 6 2 c
# 7 2 c
# 8 3 a
# 9 3 c
#10 4 NA
#11 4 NA
#12 4 b
#13 4 b
Or using data.table with the same methodology
library(data.table)
out1 <- setDT(df)[, strsplit(gsub("(NA, ){1,}(?=a)", "",
toString(status), perl = TRUE), ", "), id]
setnames(out1, 'V1', "status")[]
# id status
# 1: 1 a
# 2: 1 c
# 3: 1 a
# 4: 2 NA
# 5: 2 b
# 6: 2 c
# 7: 2 c
# 8: 3 a
# 9: 3 c
#10: 4 NA
#11: 4 NA
#12: 4 b
#13: 4 b
Update
For the updated dataset 'df2'
i1 <- setDT(df2)[, .I[seq(which(c(diff((status %in% "a") +
rleid(is.na(status))) > 1), FALSE))] , id]$V1
df2[-i1]
# id status otherVar
# 1: 1 a b
# 2: 1 c c
# 3: 1 a d
# 4: 2 NA e
# 5: 2 b f
# 6: 2 c g
# 7: 2 c h
# 8: 3 a k
# 9: 3 c l
#10: 4 NA m
#11: 4 NA n
#12: 4 b o
#13: 4 b p
Using na.locf and is.na from zoo; note that this assumes your data is ordered.
library(zoo)
df[!(na.locf(df$status, fromLast = TRUE) == 'a' & is.na(df$status)), ]
id status
2 1 a
3 1 c
4 1 a
5 2 <NA>
6 2 b
7 2 c
8 2 c
11 3 a
12 3 c
13 4 <NA>
14 4 <NA>
15 4 b
16 4 b
Here's a dplyr solution and a not-as-pretty base translation:
dplyr
library(dplyr)
df %>% group_by(id) %>%
filter(status[!is.na(status)][1]!="a" | !is.na(status))
# # A tibble: 13 x 2
# # Groups: id [4]
# id status
# <int> <chr>
# 1 1 a
# 2 1 c
# 3 1 a
# 4 2 <NA>
# 5 2 b
# 6 2 c
# 7 2 c
# 8 3 a
# 9 3 c
# 10 4 <NA>
# 11 4 <NA>
# 12 4 b
# 13 4 b
base
do.call(rbind,
lapply(split(df,df$id),
function(x) x[x$status[!is.na(x$status)][1]!="a" | !is.na(x$status),]))
# id status
# 1.2 1 a
# 1.3 1 c
# 1.4 1 a
# 2.5 2 <NA>
# 2.6 2 b
# 2.7 2 c
# 2.8 2 c
# 3.11 3 a
# 3.12 3 c
# 4.13 4 <NA>
# 4.14 4 <NA>
# 4.15 4 b
# 4.16 4 b
note
This will fail if not all NAs are leading, because it removes every NA from any group whose first non-NA value is "a"; see the counterexample sketch below.
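A minimal counterexample (hypothetical data, not from the question) showing that failure mode, the interior NA in row 3 being dropped because the group's first non-NA value is "a":
library(dplyr)
df_bad <- data.frame(
  id = c(1L, 1L, 1L, 1L),
  status = c(NA, "a", NA, "c"),  # the second NA is not leading
  stringsAsFactors = FALSE)
df_bad %>%
  group_by(id) %>%
  filter(status[!is.na(status)][1] != "a" | !is.na(status))
# only the "a" and "c" rows are returned; the interior NA is lost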
I have a named list, and I want to bind its elements. I am a big fan of data.table::rbindlist(), but it drops the empty entries. Is there any way I can preserve them as NA rows?
Here's my code:
dput(Result)
structure(list(a = c(1L, 3L), b = c(2L, 4L), c = 4L, d = integer(0),
e = integer(0), f = integer(0)), .Names = c("a", "b", "c",
"d", "e", "f"))
Here's what I tried.
Attempt 1: using data.table
Result1<-data.table::rbindlist(lapply(Result, as.data.frame),use.names=TRUE, fill=TRUE, idcol="Name")
However, I lost d, e, and f.
Attempt 2: using dplyr
dplyr::bind_rows(lapply(Result, as.data.frame))
Again, I lost d, e, and f.
Expected Output:
Result1
Name X[[i]]
1: a 1
2: a 3
3: b 2
4: b 4
5: c 4
6: d NA
7: e NA
8: f NA
I'd appreciate any help.
Here you go:
Result = structure(list(a = c(1L, 3L), b = c(2L, 4L), c = 4L, d = integer(0),
e = integer(0), f = integer(0)), .Names = c("a", "b", "c",
"d", "e", "f"))
Result2 = lapply(Result, function(x){
if(length(x)==0){NA}else{x}
})
Result3 = data.table::rbindlist(lapply(Result2,
as.data.frame),use.names=TRUE, fill=TRUE, idcol="Name")
The problem is that integer(0) is not NA, so you must convert them to NA as shown for Result2.
Result:
> Result3
Name X[[i]]
1: a 1
2: a 3
3: b 2
4: b 4
5: c 4
6: d NA
7: e NA
8: f NA
Replace the zero length elements with NA, then use rbindlist.
Result[!lengths(Result)] <- NA
## or
## is.na(Result) <- !lengths(Result)
rbindlist(lapply(Result, as.data.table), id = "Name")
# Name V1
# 1: a 1
# 2: a 3
# 3: b 2
# 4: b 4
# 5: c 4
# 6: d NA
# 7: e NA
# 8: f NA
You could also do this in base R with
is.na(Result) <- !lengths(Result)
data.frame(
Name = rep(names(Result), lengths(Result)),
V1 = unlist(Result, use.names = FALSE)
)
# Name V1
# 1 a 1
# 2 a 3
# 3 b 2
# 4 b 4
# 5 c 4
# 6 d NA
# 7 e NA
# 8 f NA
I hope this is not a duplicate question (did my best to see if it was already asked). I have a data frame and would like to count how many rows are identical.
df = data.frame(ID = c("id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8", "id9"),
Val1 = c("A", "B", "C", "A", "A", "B", "D", "C", "D"),
Val2 = c("B", "C", NA, "B", "B", "D", "E", "D", "E"),
Val3 = c("C", NA, NA, "C", "C", "B", NA, NA,NA),
Val4 = c("D", NA, NA, "E", "D", NA, NA, NA, NA))
> df
ID Val1 Val2 Val3 Val4
1 id1 A B C D
2 id2 B C <NA> <NA>
3 id3 C <NA> <NA> <NA>
4 id4 A B C E
5 id5 A B C D
6 id6 B D B <NA>
7 id7 D E <NA> <NA>
8 id8 C D <NA> <NA>
9 id9 D E <NA> <NA>
So for this example I expect the result to be A B C D 2, D E 2, B C <NA> <NA> 1, and so on.
I tried table, but I get Error in table(type_table) : attempt to make a table with >= 2^31 elements even though my df has "only" ~140K rows, and I want to apply this to a much larger dataset. I also tried summarise, but I probably don't know how to apply it correctly. Is aggregate an option? Thank you.
The reason table isn't working is that it treats each column separately and tabulates combinations of elements rather than combinations of rows.
You can try the do.call(paste, ...) combination to paste the elements together by row and run table over the result:
table(do.call(paste, df[-1]))
# A B C D A B C E B C NA NA B D B NA C D NA NA C NA NA NA D E NA NA
#       2       1         1        1         1          1         2
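If you would rather have those counts in a data frame than in a named table, one option is to wrap the same call in as.data.frame(); Var1 and Freq are simply the default column names base R produces here, nothing specific to this data:
as.data.frame(table(do.call(paste, df[-1])))
# returns one row per pasted combination (Var1) with its count (Freq)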
If table isn't efficient enough, we can try with .N from data.table instead
library(data.table)
setDT(df)[, .N, by = c(names(df)[-1])]
# Val1 Val2 Val3 Val4 N
# 1: A B C D 2
# 2: B C NA NA 1
# 3: C NA NA NA 1
# 4: A B C E 1
# 5: B D B NA 1
# 6: D E NA NA 2
# 7: C D NA NA 1
With data.table
library(data.table)
setDT(df)
df[, dups := 1:.N, setdiff(names(df), "ID")]
df[, .SD[.N], setdiff(names(df), c("ID", "dups"))][dups != 1]
Group by everything except ID, index the rows within each group of duplicates, then take the last row of each group and keep it only where the duplication index isn't 1 (a sketch of the result is below).
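For the example df above, this should return roughly the following, i.e. one row per duplicated combination, with dups recording how many times it occurs:
#    Val1 Val2 Val3 Val4  ID dups
# 1:    A    B    C    D id5    2
# 2:    D    E <NA> <NA> id9    2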
I have a data.table that looks like this
> dput(DT)
A B C A B C D
1: 1 2 3 3 5 6 7
2: 2 1 3 2 1 3 4
Here's the dput
DT <- structure(list(A = 1:2, B = c(2L, 1L), C = c(3L, 3L), A = c(3L,
2L), B = c(5L, 1L), C = c(6L, 3L), D = c(7L, 4L)), .Names = c("A",
"B", "C", "A", "B", "C", "D"), row.names = c(NA, -2L), class = c("data.table",
"data.frame"))
Basically, I want to subset the columns according to their headers. So for the header "B", I would do this:
subset(DT,,grep(unique(names(DT))[2],names(DT)))
B B
1: 2 2
2: 1 1
As you can see, the values are wrong as the second column is simply a repeat of the first. I want to get this instead:
B B
1: 2 5
2: 1 1
Can anyone help me please?
The following alternatives work for me:
pos <- grep("B", names(DT))
DT[, ..pos]
# B B
# 1: 2 5
# 2: 1 1
DT[, .SD, .SDcols = patterns("B")]
# B B
# 1: 2 5
# 2: 1 1
DT[, names(DT) %in% unique(names(DT))[2], with = FALSE]
# B B
# 1: 2 5
# 2: 1 1