I need to create a column with a unique ID, basically adding the row number as its own column. My current data frame looks like this:
V1 V2
1 23 45
2 45 45
3 56 67
How to make it look like this:
V1 V2 V3
1 23 45
2 45 45
3 56 67
?
Many thanks
Two tidyverse alternatives (using sgibb's example data):
tibble::rowid_to_column(d, "ID")
which gives:
ID V1 V2
1 1 23 45
2 2 45 45
3 3 56 67
Or:
dplyr::mutate(d, ID = row_number())
which gives:
V1 V2 ID
1 23 45 1
2 45 45 2
3 56 67 3
As you can see, the rowid_to_column() function adds the new column in front of the other ones, while the mutate() + row_number() combination adds the new column after the others.
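If you want the mutate() result but with the ID in front, one small option (my addition, assuming dplyr >= 1.0.0, which provides relocate()) is:
library(dplyr)
mutate(d, ID = row_number()) %>% relocate(ID)  # relocate() moves ID to the first position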
And another base R alternative:
d$ID <- seq_along(d[,1])
You could use cbind:
d <- data.frame(V1=c(23, 45, 56), V2=c(45, 45, 67))
## enter id here, you could also use 1:nrow(d) instead of rownames
id <- rownames(d)
d <- cbind(id=id, d)
## set colnames to OP's wishes
colnames(d) <- paste0("V", 1:ncol(d))
EDIT: Here is a comparison of @dickoa's suggestions. d$id <- seq_len(nrow(d)) is slightly faster, but the order of the columns is different (id is the last column; reordering them seems to be slower than using cbind):
library("microbenchmark")
set.seed(1)
d <- data.frame(V1=rnorm(1e6), V2=rnorm(1e6))
cbindSeqLen <- function(x) {
return(cbind(id=seq_len(nrow(x)), x))
}
dickoa <- function(x) {
x$id <- seq_len(nrow(x))
return(x)
}
dickoaReorder <- function(x) {
x$id <- seq_len(nrow(x))
nc <- ncol(x)
x <- x[, c(nc, 1:(nc-1))]
return(x)
}
microbenchmark(cbindSeqLen(d), dickoa(d), dickoaReorder(d), times=100)
# Unit: milliseconds
# expr min lq median uq max neval
# cbindSeqLen(d) 23.00683 38.54196 40.24093 42.60020 47.73816 100
# dickoa(d) 10.70718 36.12495 37.58526 40.22163 72.92796 100
# dickoaReorder(d) 19.25399 68.46162 72.45006 76.51468 88.99620 100
Many presented their ideas, but I think this is the shortest and simplest code for this task:
data$ID <- 1:nrow(data)
One line. The one and only.
You could also do this using dplyr:
DF <- mutate(DF, id = rownames(DF))
data.table solution
Easier syntax and much faster
library(data.table)
dt <- data.table(V1=c(23, 45, 56), V2=c(45, 45, 67))
setnames(dt, c("V2", "V3")) # changing column names
dt[, V1 := .I] # Adding ID column
Hope this will help. The shortest and best way to create an ID column is:
dataframe$ID <- seq.int(nrow(dataframe))
If you're starting without named rows in your df, the tidy way is:
df %>%
mutate(id = row_number()) %>%
select(id, everything())
Here is a solution that keeps the dplyr piping format and places id in the first column, which may be preferred.
d %>%
mutate(id = rownames(.)) %>%
select(id, everything())
The function rownames_to_column() moves row names into a column; it lives in the tibble package (part of the tidyverse).
rownames_to_column(DF, "my_column_name")
Use column_to_rownames() for the reverse operation.
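As a minimal round-trip sketch (my illustration, with DF and the column name "ID" as placeholders):
library(tibble)
DF2 <- rownames_to_column(DF, "ID")   # row names become a regular "ID" column
DF3 <- column_to_rownames(DF2, "ID")  # and back: "ID" becomes the row names again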
If your data set is not too large, this will work:
# Load sample data
library(tibble)
Dt1 <- tibble(V1 = c(23, 45, 56), V2 = c(45, 45, 67))
# Create a separate tibble with row numbers
Dt2 <- tibble(id = seq_len(nrow(Dt1)))
# Join together
Dt3 <- cbind(Dt2, Dt1)
I have a huge text file. I would like to extract the blocks of rows whose row indexes are defined in another data frame, such as sub. I have a loop script for it, but I would like to find a more efficient way (ideally without looping) for this task. Here is my toy example:
df <- data.frame(value=runif(10000, 0, 10^3))
df$idx <- 1:nrow(df)
sub <- data.frame(start=c(20,50,130,2000),end=c(25,60,150,2030))
sub_data <- data.frame()
for (j in 1:nrow(sub)){
dt <- df[df$idx >= sub$start[j] & df$idx <= sub$end[j],]
sub_data <- rbind(sub_data,dt)
}
sub_data
Here is one solution with data.table using a non-equi join, available since v1.9.8:
library(data.table)
dt <- data.table(value=runif(10000, 0, 10^3))
# add index column
dt[, idx := seq_len(.N)]
# create subset table
sub <- data.table(start=c(20,50,130,2000),end=c(25,60,150,2030))
# use data.table non equijoin
dt1 <- dt[sub, on = .(idx >= start, idx <= end)]
head(dt1)
#> value idx idx.1
#> 1: 820.38637 20 25
#> 2: 262.51398 20 25
#> 3: 900.37408 20 25
#> 4: 74.91815 20 25
#> 5: 507.87825 20 25
#> 6: 547.45235 20 25
# use data.table non equi join but just keep column from dt
dt2 <- dt[sub, .(value, idx = x.idx), on = .(idx >= start, idx <= end)]
head(dt2)
#> value idx
#> 1: 820.38637 20
#> 2: 262.51398 21
#> 3: 900.37408 22
#> 4: 74.91815 23
#> 5: 507.87825 24
#> 6: 547.45235 25
Here is a solution that creates the full sequence of wanted ids and then subsets df based on that sequence. df2 is the final output.
IDs <- unlist(lapply(1:nrow(sub), function(i) {sub$start[i]:sub$end[i]}))
df2 <- df[df$idx %in% IDs, ]
Or we can use functions from tidyverse.
library(tidyverse)
sub2 <- sub %>%
mutate(idx = map2(start, end, `:`)) %>%
unnest()
df2 <- df %>% semi_join(sub2, by = "idx")
Subset the relevant portion of df for each row of sub, collecting the subgroups in a list, and then rbind the subgroups together:
output = do.call(rbind, lapply(1:NROW(sub), function(i) with(sub, df[start[i]:end[i],])))
identical(sub_data, output)
#[1] TRUE
As you mention that you have a huge text file, I suggest using data.table's fread and rbindlist functions:
library(data.table)
dt_div_conquer <- function(loc, id_name, subset_id){
  # loc       : file location
  # id_name   : ID column in the file - used for filtering
  # subset_id : vector of IDs to be filtered
  ## Read only the ID column from the txt file
  v <- fread(loc, select = id_name)
  ## logical vector marking the rows to keep
  v <- v[[id_name]] %in% subset_id
  ## run-length encoding gives the start and length of each block of matches
  runs <- rle(v)
  idx <- c(0, cumsum(runs$lengths))[which(runs$values)] + 1
  ## starting row-number and length of each block as a data frame
  indx <- data.frame(start = idx, length = runs$lengths[which(runs$values)])
  ## fread each block using its row-number and length, then bind the pieces
  result <- rbindlist(apply(indx, 1, function(x) fread(loc, nrows = x[2], skip = x[1])))
  return(result)
}
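A hypothetical call could then look like this; the file name is a placeholder, and "idx" refers to the ID column from the toy example above:
# keep only the rows of the (placeholder) file whose idx falls inside sub's ranges
wanted_ids <- unlist(lapply(seq_len(nrow(sub)), function(i) sub$start[i]:sub$end[i]))
sub_data <- dt_div_conquer(loc = "big_file.txt",  # placeholder path
                           id_name = "idx",
                           subset_id = wanted_ids)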
I would like to conditionally merge two tables with the following formats:
id1 <- c('S001', 'S002', 'S003', 'S004', 'S004')
id2 <- c('S001', 'S001', 'S002', 'S002', 'S001')
ids <- data.frame(id1, id2)
and
bad_id_key <- c('S002', 'S004')
bad_id_val <- c('a', 'b')
bad_ids <- data.frame(bad_id_key, bad_id_val)
The conditional rules are:
If both IDs are in the "bad" list, drop that row
If neither ID is in the "bad" list, drop that row
If only one of the IDs is bad, add the bad value to the row.
The resulting table would look like:
id1 id2 bad_id_val
2 S002 S001 a
3 S003 S002 a
5 S004 S001 b
I was able to accomplish this with the following code snippet:
conditionalJoin <- function(row){
if(row$id1 %in% bad_id_key & row$id2 %in% bad_id_key){
# do nothing
}
else if(row$id1 %in% bad_id_key){
merge(x=row, y=bad_ids, by.x="id1", by.y="bad_id_key", all.x=TRUE)
}
else if(row$id2 %in% bad_id_key){
merge(x=row, y=bad_ids, by.x="id2", by.y="bad_id_key", all.x=TRUE)
}
}
out <- do.call("rbind", as.list(by(ids, 1:nrow(ids), conditionalJoin)))
However, this approach scales extremely poorly as the size of the ids dataframe grows. I think this is because of the rbind function. Also, the if elses are not very elegant R code.
Does anyone know of an R command to do this kind of row-wise conditional joining that is more efficient than rbind? Thanks in advance.
Using the data.table package, I would approach it as follows:
library(data.table)
ids <- setDT(ids)[xor(id1 %in% bad_ids$bad_id_key, id2 %in% bad_ids$bad_id_key)
][, bad_id_val := ifelse(id1 %in% bad_ids$bad_id_key,
as.character(bad_ids$bad_id_val[match(id1, bad_ids$bad_id_key)]),
as.character(bad_ids$bad_id_val[match(id2, bad_ids$bad_id_key)]))]
which gives the desired result:
> ids
id1 id2 bad_id_val
1: S002 S001 a
2: S003 S002 a
3: S004 S001 b
Tested on the larger dataset of @jeremycg, this gives the following outcome with regard to speed:
Unit: milliseconds
expr min lq mean median uq max neval cld
jeremy 9.196898 9.386950 9.854132 9.603002 9.749256 16.764747 100 b
OP 974.933816 985.813821 996.770067 992.145890 1000.411484 1143.402837 100 c
jaap 3.572531 3.612401 3.779686 3.679115 3.790707 9.803782 100 a
This is the fastest I can get it using dplyr. It's considerably faster, as there are only two match calls; everything else is quick. See the benchmark below.
library(dplyr)
ids %>% mutate(x = match(id1, bad_ids$bad_id_key), #get the first match of id1
y = match(id2, bad_ids$bad_id_key)) %>% #and id2
filter(xor(is.na(x), is.na(y))) %>% #filter to make sure we have 1 match
mutate(val = ifelse(is.na(x), #if x didn't match
as.character(bad_ids$bad_id_val[y]), #get the y
as.character(bad_ids$bad_id_val[x]))) # otherwise get the x
Here's a benchmark on larger data:
#5000 lines of ids
set.seed(12345)
ids <- data.frame(id1 = sample(1:50, 5000, replace = TRUE), id2 = sample(1:50, 5000, replace = TRUE))
bad_ids <- data.frame(bad_id_key = 1:20, bad_id_val = letters[1:20])
microbenchmark::microbenchmark(
me = {
ids %>% mutate(x = match(id1, bad_ids$bad_id_key),
y = match(id2, bad_ids$bad_id_key)) %>%
filter(xor(is.na(x), is.na(y))) %>%
mutate(val = ifelse(is.na(x),
as.character(bad_ids$bad_id_val[y]),
as.character(bad_ids$bad_id_val[x])))},
OP = {out <- do.call("rbind", as.list(by(ids, 1:nrow(ids), conditionalJoin)))}
)
Unit: milliseconds
 expr        min         lq       mean     median         uq        max neval
   me   11.92924   12.41934   15.36524   13.07722   15.71085   63.14211   100
   OP 1831.34599 1910.90149 2369.70980 2112.57251 2340.88428 5549.01191   100
Instead of using ifelse functions, it is often better to just work within the data.frame or data.table itself to identify the records you want to keep. For your example you could do this with the following code:
ids[xor(ids$id1 %in% bad_id_key, ids$id2 %in% bad_id_key),]
After running this code you just need to merge ids and bad_ids to append the bad id value.
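A rough sketch of that follow-up merge (my own illustration, reusing the vectors from the question; the helper column bad_id_key is introduced only to key the merge):
kept <- ids[xor(ids$id1 %in% bad_id_key, ids$id2 %in% bad_id_key), ]
kept$bad_id_key <- ifelse(kept$id1 %in% bad_id_key,
                          as.character(kept$id1),   # id1 is the bad one
                          as.character(kept$id2))   # otherwise id2 is
merge(kept, bad_ids, by = "bad_id_key")[, c("id1", "id2", "bad_id_val")]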
I am attempting to use the apply function to replace an inefficient nested for loop that will not run on a large dataset.
unique <- cbind.data.frame(c(1,2,3))
colnames(unique) <- "note"
ptSeenSub <- rbind.data.frame(c(1,"a"), c(1,"b"), c(2,"a"), c(2,"d"), c(3,"e"), c(3,"f"))
colnames(ptSeenSub) <- c("PARENT_EVENT_ID", "USER_NAME")
uniqueRow <- nrow(unique)
ptSeenSubRow <- nrow(ptSeenSub)
for (note in 1:uniqueRow)
{
for (row in 1:ptSeenSubRow)
{
if (ptSeenSub$PARENT_EVENT_ID[row] == unique$note[note])
{
unique$attending_name[note] <- ptSeenSub$USER_NAME[row]
unique$resident_name[note] <- ptSeenSub$USER_NAME[row + 1]
}
}
}
I would like the results to be similar to this dataframe:
results <- rbind.data.frame(c(1, "a", "b"), c(2, "a", "d"), c(3,"e", "f"))
colnames(results) <- c("note", "attending_name", "resident_name")
The loop will be running over millions of rows and will not finish. How can I vectorize this to finish over large data sets? Any advice is greatly appreciated.
Sounds like you are trying to reshape data into wide format. I find that dplyr and tidyr provide nice tools to accomplish this.
define data
library(tidyr)
library(dplyr)
ptSeenSub <- rbind.data.frame(c(1,"a"), c(1,"b"), c(2,"a"), c(2,"d"), c(3,"e"), c(3,"f"))
colnames(ptSeenSub) <- c("PARENT_EVENT_ID", "USER_NAME")
reshape
result <- ptSeenSub %>%
group_by(PARENT_EVENT_ID) %>%
mutate(k = row_number()) %>%
spread(k, USER_NAME)
You can then change names if you wish:
names(result) <- c("notes", "attending_name", "resident_name")
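On newer tidyr versions (>= 1.0.0), where spread() is superseded, an equivalent sketch of the same reshape (my addition, not from the original answer) would be:
result <- ptSeenSub %>%
  group_by(PARENT_EVENT_ID) %>%
  mutate(k = row_number()) %>%
  pivot_wider(names_from = k, values_from = USER_NAME) %>%
  ungroup()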
You could also use dcast from either reshape2 or the devel version of data.table (should be fast) i.e. v1.9.5
library(data.table)
setnames(dcast(setDT(ptSeenSub)[, N := 1:.N, PARENT_EVENT_ID],
               PARENT_EVENT_ID ~ N, value.var = 'USER_NAME'),
         c('note', 'attending_name', 'resident_name'))[]
# note attending_name resident_name
#1: 1 a b
#2: 2 a d
#3: 3 e f
If there are only two observations for each 'PARENT_EVENT_ID':
setDT(ptSeenSub)[, .(attending_name = USER_NAME[1L],
                     resident_name = USER_NAME[2L]), .(note = PARENT_EVENT_ID)]
# note attending_name resident_name
#1: 1 a b
#2: 2 a d
#3: 3 e f
At one stage in a longer chain of dplyr functions, I need to replace parts of a variable using numeric indices to specify which elements to replace.
My data looks like this:
df1 <- data.frame(grp = rep(1:2, each = 3),
a = 1:6,
b = rep(c(10, 20), each = 3))
df1
# grp a b
# 1 1 1 10
# 2 1 2 10
# 3 1 3 10
# 4 2 4 20
# 5 2 5 20
# 6 2 6 20
Assume that we, within each group, wish to replace elements in variable a with the corresponding elements in b, at one or more positions. In this simple example I use a single index (id), but this could be a vector of indices. First, here's how I would do it with ddply:
library(plyr)
id <- 2
ddply(.data = df1, .variables = .(grp), function(x){
x$a[id] <- x$b[id]
x
})
# grp a b
# 1 1 1 10
# 2 1 10 10
# 3 1 3 10
# 4 2 4 20
# 5 2 20 20
# 6 2 6 20
In dplyr I could think of some different ways to perform the replacement. (1) Use do with an anonymous function, similar to the one used in ddply. (2) Use mutate: concatenate a vector where the replacement is 'inserted' using numeric indexing; this is probably only fruitful for a single index. (3) Use mutate: create an index vector and use conditional replacement with ifelse.
detach("package:plyr", unload = TRUE)
library(dplyr)
# (1)
fun_do <- function(df){
l <- df %.%
group_by(grp) %.%
do(function(dat){
dat$a[id] <- dat$b[id]
dat
})
do.call(rbind, l)
}
# (2)
fun_mut <- function(df){
df %.%
group_by(grp) %.%
mutate(
a = c(a[1:(id - 1)], b[id], a[(id + 1):length(a)])
)
}
# (3)
fun_mut_ifelse <- function(df){
df %.%
group_by(grp) %.%
mutate(
idx = 1:n(),
a = ifelse(idx %in% id, b, a)) %.%
select(-idx)
}
fun_do(df1)
fun_mut(df1)
fun_mut_ifelse(df1)
In a benchmark with a slightly larger data set, the 'jigsaw puzzle insertion' is fastest, but again, this method is probably only suited for single replacements. And it doesn't look very clean...
set.seed(123)
df2 <- data.frame(grp = rep(1:200, each = 3),
a = rnorm(600),
b = rnorm(600))
library(microbenchmark)
microbenchmark(fun_do(df2),
fun_mut(df2),
fun_mut_ifelse(df2),
times = 10)
# Unit: microseconds
# expr min lq median uq max neval
# fun_do(df2) 48443.075 49912.682 51356.631 53369.644 55108.769 10
# fun_mut(df2) 891.420 933.996 1019.906 1066.663 1155.235 10
# fun_mut_ifelse(df2) 2503.579 2667.798 2869.270 3027.407 3138.787 10
Just to check the influence of the do.call(rbind, ...) part in the do function, try without it:
fun_do2 <- function(df){
df %.%
group_by(grp) %.%
do(function(dat){
dat$a[2] <- dat$b[2]
dat
})
}
fun_do2(df1)
Then a new benchmark on a larger data set:
df3 <- data.frame(grp = rep(1:2000, each = 3),
a = rnorm(6000),
b = rnorm(6000))
microbenchmark(fun_do(df3),
fun_do2(df3),
fun_mut(df3),
fun_mut_ifelse(df3),
times = 10)
Again, a simple 'insertion' is fastest, while the do function is losing ground. In the help text, do is described as "a general purpose complement" to the other dplyr functions. To me it seemed to be a natural choice for an anonymous function. However, I was surprised that do was so much slower, even when the non-dplyr rbinding part was skipped. Currently, the do documentation is rather scarce, so I wonder if I am abusing the function, and whether there are more appropriate (undocumented?) ways to do it.
I got no hits on index/indices when I searched the dplyr help text or vignette. So now I wonder:
Are there other dplyr methods to replace parts of a variable using numeric indices which I have overlooked? Specifically, is the creation of an index column in combination with ifelse the way to go, or are there more direct a[i] <- b[i]-like alternatives?
Edit following comment from @G.Grothendieck (Thanks!). Added a replace alternative (a candidate for 'See also' in ?[).
fun_replace <- function(df){
df %.%
group_by(grp) %.%
mutate(
a = replace(a, id, b[id]))
}
fun_replace(df1)
microbenchmark(fun_do(df3),
fun_do2(df3),
fun_mut(df3),
fun_mut_ifelse(df3),
fun_replace(df3),
times = 10)
# Unit: milliseconds
# expr min lq median uq max neval
# fun_do(df3) 685.154605 693.327160 706.055271 712.180410 851.757790 10
# fun_do2(df3) 291.787455 294.047747 297.753888 299.624730 302.368554 10
# fun_mut(df3) 5.736640 5.883753 6.206679 6.353222 7.381871 10
# fun_mut_ifelse(df3) 24.321894 26.091049 29.361553 32.649924 52.981525 10
# fun_replace(df3) 4.616757 4.748665 4.981689 5.279716 5.911503 10
The replace function is fastest, and certainly easier to use than fun_mut when there is more than one index.
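For example, a small sketch with two indices per group (my addition, written with the current %>% pipe rather than the old %.% syntax):
id2 <- c(1, 3)
df1 %>%
  group_by(grp) %>%
  mutate(a = replace(a, id2, b[id2]))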
Edit 2: fun_do and fun_do2 no longer work in dplyr 0.2; Error: Results are not data frames at positions:
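For reference, a hedged sketch of how the do() variant might be rewritten for dplyr >= 0.2, where do() takes expressions over the . placeholder instead of an anonymous function (my adaptation, not part of the original post):
fun_do3 <- function(df){
  df %>%
    group_by(grp) %>%
    do({
      dat <- .
      dat$a[id] <- dat$b[id]
      dat
    })
}
fun_do3(df1)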
Here's a much faster modify-in-place approach:
library(data.table)
# select rows we want, then assign b to a for those rows, in place
fun_dt = function(dt) dt[dt[, .I[id], by = grp]$V1, a := b]
# benchmark
df4 = data.frame(grp = rep(1:20000, each = 3),
a = rnorm(60000),
b = rnorm(60000))
dt4 = as.data.table(df4)
library(microbenchmark)
# using fastest function from OP
microbenchmark(fun_dt(dt4), fun_replace(df4), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# fun_dt(dt4) 15.62325 17.22828 18.42445 20.83768 21.25371 10
# fun_replace(df4) 99.03505 107.31529 116.74830 188.89134 286.50199 10
I have two data frames with 2 columns in each. For example:
df.1 = data.frame(col.1 = c("a","a","a","a","b","b","b","c","c","d"), col.2 = c("b","c","d","e","c","d","e","d","e","e"))
df.2 = data.frame(col.1 = c("b","b","b","a","a","e"), col.2 = c("a","c","e","c","e","c"))
and I'm looking for an efficient way to look up the row index in df.2 of every col.1 col.2 row pair of df.1. Note that a row pair in df.1 may appear in df.2 in reverse order (for example df.1[1,], which is "a","b" appears in df.2[1,] as "b","a"). That doesn't matter to me. In other words, as long as a row pair in df.1 appears in any order in df.2 I want its row index in df.2, otherwise it should return NA. One more note, row pairs in both data frames are unique - meaning each row pair appears only once.
So for these two data frames the return vector would be:
c(1,4,NA,5,2,NA,3,NA,6,NA)
Maybe something using the dplyr package: first make the reference frame, using row_number() to record the row index and select() to "flip" the column vars. Two halves:
df_ref_top <- df.2 %>% mutate(n=row_number())
df_ref_btm <- df.2 %>% select(col.1=col.2, col.2=col.1) %>% mutate(n=row_number())
then bind together:
df_ref <- rbind(df_ref_top,df_ref_btm)
Left join and select the n vector to get your answer:
left_join(df.1,df_ref)$n
# Per @thelatemail's comment, here's a more elegant approach:
match(apply(df.1,1,function(x) paste(sort(x),collapse="")),
apply(df.2,1,function(x) paste(sort(x),collapse="")))
# My original answer, for reference:
# Check for matches with both orderings of df.2's columns
match.tmp = cbind(match(paste(df.1[,1],df.1[,2]), paste(df.2[,1],df.2[,2])),
match(paste(df.1[,1],df.1[,2]), paste(df.2[,2],df.2[,1])))
# Convert to single vector of match indices
match.index = apply(match.tmp, 1,
function(x) ifelse(all(is.na(x)), NA, max(x, na.rm=TRUE)))
[1] 1 4 NA 5 2 NA 3 NA 6 NA
Here's a little function that tests a few of the looping options in R (which was not really intentional, but it happened).
check.rows <- function(data1, data2)
{
  df1 <- as.matrix(data1)
  df2 <- as.matrix(data2)
  ll <- vector('list', nrow(df1))
  for(i in seq(nrow(df1))){
    ll[[i]] <- sapply(seq(nrow(df2)), function(j) df2[j,] %in% df1[i,])
  }
  h <- sapply(ll, function(x) which(apply(x, 2, all)))
  sapply(h, function(x) ifelse(is.double(x), NA, x))
}
check.rows(df.1, df.2)
## [1] 1 4 NA 5 2 NA 3 NA 6 NA
And here's a benchmark when row dimensions are increased for both df.1 and df.2. Not too bad I guess, considering the 24 checks on each of 40 rows.
> dim(df.11); dim(df.22)
[1] 40 2
[1] 24 2
> f <- function() check.rows(df.11, df.22)
> microbenchmark(f())
## Unit: milliseconds
## expr min lq median uq max neval
## f() 75.52258 75.94061 76.96523 78.61594 81.00019 100
1) sort/merge. First sort df.2, creating df.2.s, and append a row number column. Then merge this new data frame with df.1 (which is already sorted in the question):
df.2.s <- replace(df.2, TRUE, t(apply(df.2, 1, sort)))
df.2.s$row <- 1:nrow(df.2.s)
merge(df.1, df.2.s, all.x = TRUE)$row
The result is:
[1] 1 4 NA 5 2 NA 3 NA 6 NA
2) sqldf. Since dot is an SQL operator, rename the data frames as df1 and df2. Note that for the same reason the column names will be transformed to col_1 and col_2 when df1 and df2 are automatically uploaded to the backend database. We sort df2 using min and max and left join it to df1 (which is already sorted):
df1 <- df.1
df2 <- df.2
library(sqldf)
sqldf("select b.rowid row
from df1
left join
(select min(col_1, col_2) col_1, max(col_1, col_2) col_2 from df2) b
using (col_1, col_2)")$row
REVISED Some code improvements. Added second solution.