apply similar variable to multiple dataset in r - r

I have 6 data named from dat1 to dat6, I want to add variable region and label them in a similar way, like this:
dat1$region <- paste("NE-1")
dat2$region <- paste("NE-2")
dat3$region <- paste("NE-3")
dat4$region <- paste("NE-4")
dat5$region <- paste("NE-5")
How can I write this code in a more concise way? using apply or for-loop?
Thanks!!

One option is to use get and assign functions in a for-loop.
Sample data:
dat1 <- data.frame(id=1:4, region = letters[1:4])
dat2 <- data.frame(id=5:8, region = letters[5:8])
dat3 <- data.frame(id=9:12, region = letters[9:12])
dat4 <- data.frame(id=13:16, region = letters[13:16])
dat5 <- data.frame(id=17:20, region = letters[17:20])
dat1
# id region
# 1 1 a
# 2 2 b
# 3 3 c
# 4 4 d
Apply for-loop:
for(i in 1:5){
name = paste("dat",i,sep="")
temp <- get(name)
temp$region = paste("NE",i,sep = "-")
assign(name, temp)
}
Verify results:
dat1
# id region
# 1 1 NE-1
# 2 2 NE-1
# 3 3 NE-1
# 4 4 NE-1
dat5
# id region
# 1 17 NE-5
# 2 18 NE-5
# 3 19 NE-5
# 4 20 NE-5

Keep all dataframes in a list then use lapply:
# example dataframes
dat1 <- cars[1:2, ]
dat2 <- cars[3:4, ]
dat3 <- cars[5:6, ]
myList <- list(dat1, dat2, dat3)
# myList
# [[1]]
# speed dist
# 1 4 2
# 2 4 10
#
# [[2]]
# speed dist
# 3 7 4
# 4 7 22
#
# [[3]]
# speed dist
# 5 8 16
# 6 9 10
Then it is easier to do repetitive operations. Loop through the list, add region column:
res <- lapply(seq_along(myList), function(i){
x <- myList[[ i ]]
x$region <- paste0("NE-", i)
x
})
res
# [[1]]
# speed dist region
# 1 4 2 NE-1
# 2 4 10 NE-1
#
# [[2]]
# speed dist region
# 3 7 4 NE-2
# 4 7 22 NE-2
#
# [[3]]
# speed dist region
# 5 8 16 NE-3
# 6 9 10 NE-3

How about this (assuming all your items start with dat and end with a unique identifier string):
dat_names <- ls()[grepl("^dat", ls())]
dat_ID <- sapply(dat_names, function(d) gsub("dat", "", d))
for(d in 1:length(dat_names)) {
dat_names[[d]]$region <- paste("NE-", dat_ID[d], sep="")
}

Related

Custom bins per group in dataframe - R function

I would like to create bins for the variable numbers per category name inside a function. But I am having difficulties related to using the category name provided as parameter inside the function. Maybe a data.table approach would be better.
set.seed(10)
b<-(rnorm(10, sd=1,mean=10))
y<-runif(3)
pr<-y/sum(y)
names<-unlist(lapply(mapply(rep, LETTERS[1:3], 1:3), function (x) paste0(x, collapse = "") ) )
x <- sample(names, 10, replace=TRUE, prob=pr)
df<-data.frame(name=x,numbers=b)
df
#working without bin limits per category (not desired)
#and using "numbers" in cut (not desired)
binfunction1 <- function(df, colgroup1, varcount,binsize) {
new<-df %>%
group_by_(colgroup1) %>%
mutate(bin = cut(numbers, breaks <- c(seq(7, 15, by = binsize)), # limits by colgroup not implemented
labels = 1:(length(breaks)-1) ) )
return(new)
}
binfunction1(df,"name","numbers",0.5)
name numbers bin
<fctr> <dbl> <fctr>
1 BB 10.018746 7
2 A 9.815747 6
3 CCC 8.628669 4
4 CCC 9.400832 5
5 BB 10.294545 7
6 CCC 10.389794 7
7 A 8.791924 4
8 A 9.636324 6
9 A 8.373327 3
10 A 9.743522 6
Not the most elegant solution, but is the outcome something you are after? (I didn't quite understand your question)
binfunction3 <- function(x, colgroup1, varcount, binsize) {
tmp <- split(x, x[[colgroup1]], drop = TRUE)
tp <- lapply(tmp, function(k) {
breaks <- c(seq(min(k[[varcount]])*0.9, max(k[[varcount]])*1.1, by = binsize))
cbind(k, data.frame(bin = cut(k[[varcount]], breaks, labels = 1:(length(breaks)-1))))
})
tp <- do.call(rbind, tp)
rownames(tp) <- gsub("[[:alpha:]]*\\.", "", rownames(tp))
return(tp[rownames(x),])
}
binfunction3(df,"name","numbers",0.5)
# name numbers bin
# 1 A 10.018746 5
# 2 CCC 9.815747 5
# 3 CCC 8.628669 2
# 4 BB 9.400832 2
# 5 A 10.294545 6
# 6 BB 10.389794 4
# 7 A 8.791924 3
# 8 CCC 9.636324 4
# 9 A 8.373327 2
# 10 A 9.743522 5
My answer based on Mikko's, but allowing better control over min and max limits of breaks and bin size.
binfunctionnew <- function(x, colgroup, varcount, binexp) {
tmp <- split(x, x[colgroup], drop = TRUE)
tp <- lapply(tmp, function(k) {
bin<-cut(k[,varcount],
breaks=c(seq(min(k[,varcount])*(1-10^(-(binexp+1))),
max(k[,varcount])*(1+10^(-(binexp-2))),
by = 10^(-(binexp))) ) , labels=F)
cbind (k, data.frame(bin = bin))
} )
tp <- do.call(rbind, tp)
return(tp)
}
#example or
binfunctionnew(df,"name","numbers",1) binfunctionnew(df,"name","numbers",0)
# name numbers bin name numbers bin
# A.1 A 10.018746 18 A.1 A 10.018746 3
# A.5 A 10.294545 21 A.5 A 10.294545 3
# A.7 A 8.791924 6 A.7 A 8.791924 2
# A.9 A 8.373327 1 A.9 A 8.373327 1
# A.10 A 9.743522 15 A.10 A 9.743522 3
# BB.4 BB 9.400832 1 BB.4 BB 9.400832 1
# BB.6 BB 10.389794 11 BB.6 BB 10.389794 2
# CCC.2 CCC 9.815747 13 CCC.2 CCC 9.815747 3
# CCC.3 CCC 8.628669 1 CCC.3 CCC 8.628669 1
# CCC.8 CCC 9.636324 11 CCC.8 CCC 9.636324 2

R - Collapse into vector same member of a list

I have a list with same structure for every member as the following
config <- NULL
config[["secA"]] <- NULL
config[["secA"]]$VAL <- 0
config[["secA"]]$ARR <- c(1,2,3,4,5)
config[["secA"]]$DF <- data.frame(matrix(c(1,5,3,8),2,2))
config[["secB"]] <- NULL
config[["secB"]]$VAL <- 1
config[["secB"]]$ARR <- c(1,3,2,4,9)
config[["secB"]]$DF <- data.frame(matrix(c(2,6,1,9),2,2))
config[["secC"]] <- NULL
config[["secC"]]$VAL <- 5
config[["secC"]]$ARR <- c(4,2,1,5,8)
config[["secC"]]$DF <- data.frame(matrix(c(4,2,1,7),2,2))
and I need to obtain 3 vectors VAL, ARR and DF, each with the concatenated elements of the corresponding member. such as
# VAL: 0,1,5
# ARR: 1,2,3,4,5,1,3,2,4,9,4,2,1,5,8
# DF: 1,5,3,8,2,6,1,9,4,2,1,7
Looking at similar situations, I have the feeling I need to use a combination of do.call and cbind or lapply but I have no clue. any suggestions?
config <- NULL
config[["secA"]] <- NULL
config[["secA"]]$VAL <- 0
config[["secA"]]$ARR <- c(1,2,3,4,5)
config[["secA"]]$DF <- data.frame(matrix(c(1,5,3,8),2,2))
config[["secB"]] <- NULL
config[["secB"]]$VAL <- 1
config[["secB"]]$ARR <- c(1,3,2,4,9)
config[["secB"]]$DF <- data.frame(matrix(c(2,6,1,9),2,2))
config[["secC"]] <- NULL
config[["secC"]]$VAL <- 5
config[["secC"]]$ARR <- c(4,2,1,5,8)
config[["secC"]]$DF <- data.frame(matrix(c(4,2,1,7),2,2))
sapply(names(config[[1]]), function(x)
unname(unlist(sapply(config, `[`, x))), USE.NAMES = TRUE)
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# [1] 1 5 3 8 2 6 1 9 4 2 1 7
Or you can use this clist function
Unfortunately there were no other answers.
(l <- Reduce(clist, config))
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# X1 X2 X1 X2 X1 X2
# 1 1 3 2 1 4 1
# 2 5 8 6 9 2 7
It merges data frames and matrices, so you need to unlist to get the vector you want
l$DF <- unname(unlist(l$DF))
l
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# [1] 1 5 3 8 2 6 1 9 4 2 1 7
Function
clist <- function (x, y) {
islist <- function(x) inherits(x, 'list')
'%||%' <- function(a, b) if (!is.null(a)) a else b
get_fun <- function(x, y)
switch(class(x %||% y),
matrix = cbind,
data.frame = function(x, y)
do.call('cbind.data.frame', Filter(Negate(is.null), list(x, y))),
factor = function(...) unlist(list(...)), c)
stopifnot(islist(x), islist(y))
nn <- names(rapply(c(x, y), names, how = 'list'))
if (is.null(nn) || any(!nzchar(nn)))
stop('All non-NULL list elements should have unique names', domain = NA)
nn <- unique(c(names(x), names(y)))
z <- setNames(vector('list', length(nn)), nn)
for (ii in nn)
z[[ii]] <- if (islist(x[[ii]]) && islist(y[[ii]]))
Recall(x[[ii]], y[[ii]]) else
(get_fun(x[[ii]], y[[ii]]))(x[[ii]], y[[ii]])
z
}
Another approach, with slightly less code.
un_config <- unlist(config)
un_configNAM <- names(un_config)
vecNAM <- c("VAL", "ARR", "DF")
for(n in vecNAM){
assign(n, un_config[grepl(n, un_configNAM)])
}
This will return 3 vectors as the OP requested. However, generally it is more advantageous to store results in a list as rawr suggests. You of course can adopt the above code so that results are stored within a list.
l <- rep(list(NA), length(vecNAM))
i = 1
for(n in vecNAM){
l[[i]] <- un_config[grepl(n, un_configNAM)]
i = i +1
}

How to permut list level in R

is there's a way to permute list levels in R?
In other words, how can i simply go from test list to test2?
test=list(
a=list("alpha"=1:2,"beta"=3:5),
b=list("alpha"=5:6,"omega"=7:10)
)
test2 <- list(
alpha=list("a"=1:2,"b"=5:6),
beta=list("a"=3:5),
omega=list("b"=7:10)
)
How about this:
tst <- unlist(test, recursive = FALSE)
lst <- split(tst, gsub("[a-z]\\.|[0-9]", "", names(tst)))
lapply(lst, function(z) setNames(z, substring(names(z), 1, 1)))
# $alpha
# $alpha$a
# [1] 1 2
#
# $alpha$b
# [1] 5 6
#
#
# $beta
# $beta$a
# [1] 3 4 5
#
#
# $omega
# $omega$b
# [1] 7 8 9 10

column bind in R and name the column

I want to column bind (cbind) mydf[,"c"] and give it a new name newcolumn in one step and get the result matrix mydf. How do I do it?
mydf
# a b c
# 1 2 6
# 1 3 4
mydf
# a b c newcolumn
# 1 2 6 6
# 1 3 4 4
You can specify the new column name in the call to cbind:
mydf <- cbind(mydf, newcolumn=mydf[,"c"])
mydf
# a b c newcolumn
# [1,] 1 2 6 6
# [2,] 1 3 4 4
Data (constructed with the same approach):
mydf <- cbind(a=c(1, 1), b=c(2, 3), c=c(6, 4))
If you had a data frame instead of a matrix, you could simply do mydf$newcolumn <- mydf$c.
There are many approaches you could take here:
mydf <- data.frame(a=c(1,1),b=c(2,3),c=c(6,4));
mydf;
## a b c
## 1 1 2 6
## 2 1 3 4
data.frame(mydf,newcolumn=mydf$c);
## a b c newcolumn
## 1 1 2 6 6
## 2 1 3 4 4
cbind(mydf,newcolumn=mydf$c);
## a b c newcolumn
## 1 1 2 6 6
## 2 1 3 4 4
transform(mydf,newcolumn=c);
## a b c newcolumn
## 1 1 2 6 6
## 2 1 3 4 4
within(mydf,newcolumn <- c);
## a b c newcolumn
## 1 1 2 6 6
## 2 1 3 4 4
mydf$newcolumn <- mydf$c;
mydf;
## a b c newcolumn
## 1 1 2 6 6
## 2 1 3 4 4
Of the 5 approaches shown above, only the last actually modifies mydf. For the other 4, you have to assign mydf to the return value to replace it with the new data.frame that has the additional column.
library('microbenchmark');
bind.df <- function() mydf <- data.frame(mydf,newcolumn=mydf$c);
bind.cb <- function() mydf <- cbind(mydf,newcolumn=mydf$c);
bind.tr <- function() mydf <- transform(mydf,newcolumn=c);
bind.wi <- function() mydf <- within(mydf,newcolumn <- c);
bind.as1 <- function() mydf$newcolumn <- mydf$c;
bind.as2 <- function() mydf['newcolumn'] <- mydf['c'];
bind.as3 <- function() mydf[,'newcolumn'] <- mydf[,'c'];
bind.as4 <- function() mydf[['newcolumn']] <- mydf[['c']];
N <- 1e5; mydf <- data.frame(a=rep(c(1,1),N),b=rep(c(2,3),N),c=rep(c(6,4),N));
microbenchmark(bind.df(),bind.cb(),bind.tr(),bind.as1(),bind.as2(),bind.as3(),bind.as4(),times=1e4);
## Unit: microseconds
## expr min lq mean median uq max neval
## bind.df() 97.077 112.046 128.66080 121.027 134.711 1690.513 10000
## bind.cb() 86.814 100.927 117.14364 109.907 122.737 1849.172 10000
## bind.tr() 105.203 120.171 138.90802 131.290 145.830 1680.250 10000
## bind.as1() 12.402 20.100 23.35085 22.239 25.660 148.397 10000
## bind.as2() 370.776 412.686 596.47901 425.088 449.036 41799.239 10000
## bind.as3() 347.682 385.743 564.78320 396.435 419.528 42144.355 10000
## bind.as4() 17.534 26.087 30.09639 28.654 32.930 638.915 10000
If there are two columns and you would like to add two column by column then use cbind in dataframe type.
dapu <- cbind(data.frame(data_r), data.frame(data_c))

R data frame manipulation

Suppose I have a data frame that looks like this.
# start end motif
# 2 6 a
# 10 15 b
# 30 35 c
How would I create a data frame that fills in the remaining start and end locations like so up to a certain number Max_end:
Max_end <- 33
# start end motif
# 0 2 na # <- 0-2 are filled in because it is not in the original data frame
# 2 6 a # <- 2-6 are in the original
# 6 10 na # <- 6-10 is not
# 10 15 b # <- 10-15 is
# 15 30 na # and so on
# 30 33 c
And further, calculates the distance between the start and end locations and creates a one column data frame.
# Length motif
# 2 na
# 4 a
# 4 na
# 5 b
# 15 na
# 3 c
Currently this is how i am doing it: It is very inefficient
library(data.table)
library(stringi)
f <- fread('ABC.txt',header=F,skip=1)$V1
f <- paste(f, collapse = "")
motifs = c('GATC', 'CTGCAG', 'ACCACC', 'CC(A|T)GG', 'CCAC.{8}TGA(C|T)')
v <- na.omit(data.frame(do.call(rbind, lapply(stri_locate_all_regex(f, motifs), unlist))))
v <- v[order(v[,1]),]
v2difference <- "blah"
for(i in 2:nrow(v)){
if(v[i,1] > v[i-1,2]+2){v2difference[i] <- v[i,1]-v[i-1,2]-2}
}
v2difference[1] <- v[1,1]
v2 <- data.frame(Order=seq(1, 2*nrow(v), 2),Lengths=matrix(v2difference, ncol = 1),Motifs="na")
v1 <- data.frame(Order=seq(2, 2*nrow(v), 2),Lengths=(v$end-v$start+1),Motifs=na.omit(unlist(stri_extract_all_regex(f,motifs))))
V <- data.frame(Track=1,rbind(v1,v2))
V <- V[order(V$Order),]
B <- V[,!(names(V) %in% "Order")]
Max_end <- 33
breaks <- c(0, t(as.matrix(dat[,1:2])), Max_end) # get endpoints
breaks <- breaks[breaks <= Max_end]
merge(dat, data.frame(start=breaks[-length(breaks)], end=breaks[-1]), all=T)
# start end motif
# 1 0 2 <NA>
# 2 2 6 a
# 3 6 10 <NA>
# 4 10 15 b
# 5 15 30 <NA>
# 6 30 33 <NA>
# 7 30 35 c
To specify a start and endpoint, you could do
Max_end <- 33
Max_start <- 10
breaks <- unique(c(Max_start, t(as.matrix(dat[,1:2])), Max_end))
breaks <- breaks[breaks <= Max_end & breaks >= Max_start]
merge(dat, data.frame(start=breaks[-length(breaks)], end=breaks[-1]), all.y=T)
# start end motif
# 1 10 15 b
# 2 15 30 <NA>
# 3 30 33 <NA>
Note: this doesn't include "c" in the shortened final interval, you would need to decide if that values gets included or not when the interval changes.

Resources