How to extract blocks of rows without looping in R

I have a huge text file. I would like to extract the blocks of rows whose row indexes are defined in another data frame, such as sub. I have a loop script for this, but I would like to find a more efficient way (ideally without looping). Here is my toy example:
df <- data.frame(value=runif(10000, 0, 10^3))
df$idx <- 1:nrow(df)
sub <- data.frame(start=c(20,50,130,2000),end=c(25,60,150,2030))
sub_data <- data.frame()
for (j in 1:nrow(sub)) {
  dt <- df[df$idx >= sub$start[j] & df$idx <= sub$end[j], ]
  sub_data <- rbind(sub_data, dt)
}
sub_data

Here is one solution with data.table, using the non-equi join available since v1.9.8.
library(data.table)
dt <- data.table(value=runif(10000, 0, 10^3))
# add index column
dt[, idx := seq_len(.N)]
# create subset table
sub <- data.table(start=c(20,50,130,2000),end=c(25,60,150,2030))
# use data.table non-equi join
dt1 <- dt[sub, on = .(idx >= start, idx <= end)]
head(dt1)
#> value idx idx.1
#> 1: 820.38637 20 25
#> 2: 262.51398 20 25
#> 3: 900.37408 20 25
#> 4: 74.91815 20 25
#> 5: 507.87825 20 25
#> 6: 547.45235 20 25
# use data.table non-equi join but keep only the columns from dt
dt2 <- dt[sub, .(value, idx = x.idx), on = .(idx >= start, idx <= end)]
head(dt2)
#> value idx
#> 1: 820.38637 20
#> 2: 262.51398 21
#> 3: 900.37408 22
#> 4: 74.91815 23
#> 5: 507.87825 24
#> 6: 547.45235 25
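If you only need the filtered rows themselves (without the interval columns), data.table also provides inrange()/%inrange%; a small sketch of the same filter (my addition, not part of the original answer):
# keep rows whose idx falls inside any [start, end] interval of sub
dt3 <- dt[idx %inrange% sub[, .(start, end)]]
head(dt3)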

Here is a solution that creates the sequence of all requested IDs and then subsets df based on that sequence; df2 is the final output.
IDs <- unlist(lapply(1:nrow(sub), function(i) {sub$start[i]:sub$end[i]}))
df2 <- df[df$idx %in% IDs, ]
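The same ID sequence can be built a bit more compactly with Map(); a sketch:
IDs <- unlist(Map(seq, sub$start, sub$end))  # same result as the lapply() above
df2 <- df[df$idx %in% IDs, ]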
Or we can use functions from the tidyverse.
library(tidyverse)
sub2 <- sub %>%
  mutate(idx = map2(start, end, `:`)) %>%
  unnest()
df2 <- df %>% semi_join(sub2, by = "idx")
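Note that since tidyr 1.0 a bare unnest() warns and asks you to name the list column explicitly; a sketch of the updated spelling:
sub2 <- sub %>%
  mutate(idx = map2(start, end, `:`)) %>%
  unnest(cols = idx)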

Subset the relevant portion of df for each row of sub, collect the subgroups in a list, and then rbind them together:
output = do.call(rbind, lapply(1:NROW(sub), function(i) with(sub, df[start[i]:end[i],])))
identical(sub_data, output)
#[1] TRUE

Since you mention that you have a huge text file, I suggest using data.table's fread and rbindlist functions:
dt_div_conquer <- function(loc, id_name, subset_id){
  # loc : file location
  # id_name : ID column in the file - used for filtering
  # subset_id : vector of IDs to keep
  ## read only the ID column from the text file
  v <- fread(loc, select = id_name)
  ## logical vector marking the rows to keep
  v <- v[[id_name]] %in% subset_id
  ## run-length encoding gives the contiguous blocks of matching rows
  runs <- rle(v)
  idx <- c(0, cumsum(runs$lengths))[which(runs$values)] + 1
  ## starting row number and block length as a data frame
  indx <- data.frame(start = idx, length = runs$lengths[which(runs$values)])
  ## fread each block (skip jumps over the header plus preceding rows) and bind
  result <- rbindlist(
    apply(indx, 1, function(x) fread(loc, nrows = x[2], skip = x[1])))
  return(result)
}
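A hypothetical call; the file name 'big_file.txt' and the column name 'idx' are placeholders, not from the original post:
## rows to keep, built from the sub table above
wanted <- unlist(Map(seq, sub$start, sub$end))
result <- dt_div_conquer(loc = "big_file.txt", id_name = "idx", subset_id = wanted)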

Related

Is there a function in R to avoid using a loop when looking for the matching index of every element of a vector?

I have this for loop, which returns the first matching index of every element in the vector, but it is very slow (nrow(data) > 50,000).
example:
id1 <- c(1,5,8,10)
id2 <- c(5,8,10,1)
data <- data.frame(id1,id2, idx = 1:length(id1))
The results should be:
data$new_id
# [1] 4 1 2 3
data$new_id <- NA
for (i in 1:nrow(data)) {
  data$new_id[i] <- which(data$id2 == data$id1[i])
}
I found that the following works for a small data frame, but unfortunately R returns "Error: cannot allocate vector of size 22.2 Gb":
A <- outer(data$id1, data$id2, "==")
data <- data %>%
  mutate(new_id = which(t(A)),
         id0 = 0:(nrow(data) - 1),
         new_id = new_id - nrow(data) * id0)
Does another solution exist for this indexing?
We can use match, which is very fast as a base R function. Here we are just matching two columns of the dataset, without even needing to put the two datasets together:
with(data, match(id1, id2))
#[1] 4 1 2 3
To make this faster, use fmatch from fastmatch
library(fastmatch)
with(data, fmatch(id1, id2))
Benchmarks
set.seed(24)
data1 <- data.frame(id1 = sample(1e7), id2 = sample(1e7))
system.time(with(data1, match(id1, id2)))
# user system elapsed
# 1.635 0.079 1.691
system.time(with(data1, fmatch(id1, id2)))
# user system elapsed
# 1.155 0.062 1.195
system.time({
  data2 <- data.table(id = data1$id1)
  data3 <- data.table(id = data1$id2)
  data2[data3, idx := .I, on = .(id)]
})
# user system elapsed
# 2.306 0.051 2.353
When using large datasets, you could try a data.table join (usually pretty fast). It should be even faster (on large sets) if you set keys first.
library( data.table )
# make data.tables out of your vectors
dt1 <- data.table(id = id1)
dt2 <- data.table(id = id2)
# update join: fill dt1$idx with the matching row number, joining on id
dt1[dt2, idx := .I, on = .(id)]
# id idx
# 1: 1 4
# 2: 5 1
# 3: 8 2
# 4: 10 3
NB: this only returns the first matching position!
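If you need every matching position rather than just the first, one sketch (my addition, untested on large data) is to store row numbers in dt2 and use a regular join, which returns all matches by default:
dt2[, pos := .I]     # row numbers of dt2
dt2[dt1, on = .(id)] # one output row per match; pos holds the position(s) in id2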

Return the IDs of a matrix that have all elements of a vector

I know the answer is probably easy, but I have not gotten it so far, and I could not find it in similar questions either. I need to return the IDs of matrix m that have all elements of a vector (NoN). In the example I prepared below, I need to return IDs 1 and 3.
Example:
m<-matrix(c(1,1,1,1,2,2,34,45,4,4,4,4,4,5,6,3,3,3,3,21,22,3425,345,65,22,42,65,86,456,454,5678,5,234,22,65,21,22,786),nrow=19)
colnames(m)<-c("ID","LO")
NoN<-c(21,22)
My attempts so far are as follows:
1: m[all(m[,2] %in% NoN),1]
2: m[match(NoN, m[,2]),1]
3: subset(m, m[,2] %in% NoN)
4: m[which(m[,2] %in% NoN),1]
Appreciate!
Here's a function using base R:
FOO <- function(m, NoN){
  # split matrix based on ID column
  m2 <- lapply(split(m, m[, 1]), function(x) matrix(x, ncol = 2))
  # match every element of NoN per ID; unmatched elements become NA
  matchresult <- do.call(cbind, lapply(lapply(m2, function(x) lapply(NoN, function(y) match(y, x[, 2]))), unlist))
  # return colnames (= ID) of columns with no NA
  as.numeric(colnames(matchresult)[colSums(apply(matchresult, 2, is.na)) == 0])
}
Result of function call:
> FOO(m, NoN)
[1] 1 3
Untested except for your example, but this should be able to handle any length of NoN as well as duplicated combinations of ID and LO.
Edit: A more concise and efficient variant provided by @docendodiscimus:
FOO <- function(m, NoN){
  df <- as.data.frame(m)
  unique(df[as.logical(ave(df$LO, df$ID, FUN = function(x) all(NoN %in% x))), "ID"])
}
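Called the same way, this variant returns the same IDs:
FOO(m, NoN)
# [1] 1 3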
A not so safe way using base R:
m<-matrix(c(1,1,1,1,2,2,34,45,4,4,4,4,4,5,6,3,3,3,3,21,22,3425,345,65,22,42,65,86,456,454,5678,5,234,22,65,21,22,786),nrow=19)
colnames(m)<-c("ID","LO")
NoN<-c(21,22)
IDs <- m[m[, 2] %in% NoN, 1]
IDs <- table(IDs)
IDs <- names(IDs)[IDs >= length(NoN)]
> IDs
[1] "1" "3"
But beware: this does not take duplicated values into account. So if ID 1 had two LOs of value 21 but no 22, it would still return ID 1.
EDIT: A safe way using dplyr:
library(dplyr)
m <- data.frame(m)
IDs <- m %>%
  slice(which(LO %in% NoN)) %>%            # get all rows whose LO value is in NoN
  group_by(ID) %>%                         # group by ID
  summarise(uniques = n_distinct(LO)) %>%  # count unique values per ID
  filter(uniques == length(NoN)) %>%       # the number of unique values has to match the length of NoN
  select(ID) %>%                           # select the ID column
  unlist() %>%                             # unlist it
  as.numeric()                             # convert from named num to numeric
> IDs
[1] 1 3
Here's an alternative solution that converts matrix m to a data frame and computes a summary for each ID:
# example data
m<-matrix(c(1,1,1,1,2,2,34,45,4,4,4,4,4,5,6,3,3,3,3,21,22,3425,345,65,22,42,65,86,456,454,5678,5,234,22,65,21,22,786),nrow=19)
colnames(m)<-c("ID","LO")
NoN<-c(21,22)
library(dplyr)
data.frame(m) %>%                            # save m as a data frame
  group_by(ID) %>%                           # for each ID
  summarise(sum_flag = sum(LO %in% NoN)) %>% # count the LO elements that are in NoN
  filter(sum_flag == length(NoN)) %>%        # keep IDs where this count matches the length of NoN
  pull(ID)                                   # get the corresponding IDs
# [1] 1 3
Keep in mind that this process assumes (based on your example) that elements of NoN and rows of m are unique.
I took this answer from @docendo discimus, which I found efficient and concise.
df <- as.data.frame(m)
unique(df[as.logical(ave(df$LO, df$ID, FUN = function(x) all(NoN %in% x))),"ID"])

Passing vector with multiple values into R function to generate data frame

I have a table, called table_wo_nas, with multiple columns, one of which is titled ID. For each value of ID there are many rows. I want to write a function that, for input x, will output a data frame containing the number of rows for each ID, with column headers ID and nobs, as below for x <- c(2,4,8).
## id nobs
## 1 2 1041
## 2 4 474
## 3 8 192
This is what I have. It works when x is a single value (e.g. 3), but not when it contains multiple values, for example 1:10 or c(2,5,7). I receive the warning "In ID[counter] <- x : number of items to replace is not a multiple of replacement length". I've just started learning R and have been struggling with this for a week; I have searched manuals, this site, Google, everything. Can someone please help?
counter <- 1
ID <- vector("numeric")   ## contain x
nobs <- vector("numeric") ## contain nrow
for (i in x) {
  r <- subset(table_wo_nas, ID %in% x) ## create subset for rows of ID=x
  ID[counter] <- x                     ## add x to ID
  nobs[counter] <- nrow(r)             ## add nrow to nobs
  counter <- counter + 1               ## loop
}
result <- data.frame(ID, nobs)         ## create data frame
In base R,
# To make a named vector, either:
tmp <- sapply(split(table_wo_nas, table_wo_nas$ID), nrow)
# OR just:
tmp <- table(table_wo_nas$ID)
# AND
# arrange into a data.frame (as.numeric() keeps the table variant from
# being expanded into extra columns by data.frame())
nobs_df <- data.frame(ID = names(tmp), nobs = as.numeric(tmp))
Alternatively, coerce the table into a data.frame directly, and rename:
nobs_df <- data.frame(table(table_wo_nas$ID))
names(nobs_df) <- c('ID', 'nobs')
If you only want certain rows, subset:
nobs_df[c(2, 4, 8), ]
There are many, many more options; these are just a few.
With dplyr,
library(dplyr)
table_wo_nas %>% group_by(ID) %>% summarise(nobs = n())
If you only want certain IDs, add on a filter:
table_wo_nas %>% group_by(ID) %>% summarise(nobs = n()) %>% filter(ID %in% c(2, 4, 8))
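In newer dplyr versions, count() collapses the group_by()/summarise() pair into one call (the name argument assumes dplyr >= 0.8.1):
table_wo_nas %>% count(ID, name = "nobs") %>% filter(ID %in% c(2, 4, 8))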
Seems pretty straightforward if you just use table again:
tbl <- table( table_wo_nas[ , 'ID'] )
data.frame( IDs = names(tbl), nobs= tbl)
You could also get a quick answer, although with different column names, using:
as.data.frame(table( table_wo_nas[ , 'ID'] ))
Try this.
x <- c(2, 4, 8)
# df is your data frame table_wo_nas
count_of <- function(x) {
  count_of_id <- numeric(length(x))
  for (i in 1:length(x)) {
    # find the number of rows for each value of x
    count_of_id[i] <- length(which(df$ID == x[i]))
  }
  df_1 <- cbind(ID = x, count_of_id)
  return(df_1)
}
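A hypothetical call, assuming df holds table_wo_nas as noted in the comment above:
count_of(c(2, 4, 8))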

Matching vector values by records in a data frame in R

I have a vector of values r as follows:
r<-c(1,3,4,6,7)
and a data frame df with 21 records and two columns:
id<-c(1,2,3,4,5,6,7,8,9,10,11,12,13,1,4,15,16,17,18,19,20)
freq<-c(1,3,2,4,5,6,6,7,8,3,3,1,6,9,9,1,1,4,3,7,7)
df<-data.frame(id,freq)
Using the r vector, I need to extract a sample of records (in the form of a new data frame) from df such that the freq values of the records equal the values in my r vector. Needless to say, if it finds multiple records with the same freq value, it should randomly pick one of them. For instance, one possible outcome can be:
id frequency
12 1
10 3
4 4
7 6
8 7
I would be thankful if anyone could help me with this.
You could try data.table
library(data.table)
setDT(df)[freq %in% r, sample(id, 1L), freq]
Or using base R
aggregate(id ~ freq, df, subset = freq %in% r, FUN = sample, 1L)
Update
If you have a vector 'r' with duplicate values and want to sample the dataset ('df') so that each value is drawn as many times as it occurs in 'r':
r <-c(1,3,3,4,6,7)
res <- do.call(rbind, lapply(split(r, r), function(x) {
  x1 <- df[df$freq %in% x, ]
  x1[sample(1:nrow(x1), length(x), replace = FALSE), ]
}))
row.names(res) <- NULL
You can use filter and sample_n from "dplyr":
library(dplyr)
set.seed(1)
df %>%
  filter(freq %in% r) %>%
  group_by(freq) %>%
  sample_n(1)
# Source: local data frame [5 x 2]
# Groups: freq
#
# id freq
# 1 12 1
# 2 10 3
# 3 17 4
# 4 13 6
# 5 8 7
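As an aside, dplyr >= 1.0 supersedes sample_n() with slice_sample(); an equivalent sketch:
df %>%
  filter(freq %in% r) %>%
  group_by(freq) %>%
  slice_sample(n = 1)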
Have you tried using the match() function or %in%? This might not be a fast/clean solution, but uses only base R functions:
rUnique <- unique(r)
df2 <- df[df$freq %in% rUnique,]
x <- data.frame(id = NA, freq = rUnique)
for (i in 1:length(rUnique)) {
  x[i, 1] <- sample(df2[df2[, 2] == rUnique[i], 1], 1)
}
print(x)

Aggregating data frame rows using an input vector

I have this toy data.frame:
df = data.frame(id = c("a","b","c","d"), value = c(2,3,6,5))
and I'd like to aggregate its rows according to this toy vector:
collapsed.ids = c("a,b","c","d")
where the aggregated data.frame should keep max(df$value) of its aggregated rows.
So for this toy example the output would be:
> aggregated.df
id value
1 a,b 3
2 c 6
3 d 5
I should note that my real data.frame has ~150,000 rows.
I would use data.table for this.
Something like the following should work:
library(data.table)
DT <- data.table(df, key = "id") # Main data.table
Key <- data.table(ind = collapsed.ids) # your "Key" table
## We need your "Key" table in a long form
Key <- Key[, list(id = unlist(strsplit(ind, ",", fixed = TRUE))), by = ind]
setkey(Key, id) # Set the key to facilitate a merge
## Merge and aggregate in one step
DT[Key][, list(value = max(value)), by = ind]
# ind value
# 1: a,b 3
# 2: c 6
# 3: d 5
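With current data.table versions you can skip the setkey() calls and join with on= directly; a sketch of the same merge-and-aggregate (assumes df$id is character, the R >= 4.0 default):
Key2 <- data.table(ind = collapsed.ids)[, .(id = unlist(strsplit(ind, ",", fixed = TRUE))), by = ind]
as.data.table(df)[Key2, on = .(id)][, .(value = max(value)), by = ind]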
You don't need data.table; you can just use base R.
split.ids <- strsplit(collapsed.ids, ",")
split.df <- data.frame(id = unlist(split.ids),
                       joinid = rep(collapsed.ids, sapply(split.ids, length)))
aggregated.df <- aggregate(value ~ joinid, data = merge(df, split.df), max)
names(aggregated.df)[1] <- "id"
Result:
# id value
# 1 a,b 3
# 2 c 6
# 3 d 5
Benchmark
df <- df[rep(1:4, 50000), ] # Make a big data.frame
system.time(...) # of the above code
# user system elapsed
# 1.700 0.154 1.947
EDIT: Apparently Ananda's code runs in 0.039 s, so I'm eating crow. But either is acceptable for this size.
