Split a vector using specific elements as delimiters - r

For example in the following vector I want to use the zero elements as delimiters
x <- c(12, 1, 2, 15, 18, 0, 15, 13, 14, 9, 1, 0, 22, 9, 14, 3, 9, 20)
and get (maybe in a list) the vectors
c(12, 1, 2, 15, 18)
c(15, 13, 14, 9, 1)
c(22, 9, 14, 3, 9, 20)
How would you do that in R?

For example:
splt <- cumsum(x == 0)
splt[x == 0] <- NA
split(x, splt)
# $`0`
# [1] 12 1 2 15 18
# $`1`
# [1] 15 13 14 9 1
# $`2`
# [1] 22 9 14 3 9 20

You may do like this,
x <- c(12, 1, 2, 15, 18, 0, 15, 13, 14, 9, 1, 0, 22, 9, 14, 3, 9, 20)
lapply(strsplit(paste(x, collapse = ";"), "^0;|;0$|;0;"), function(y) {strsplit(y, ";")})[[1]]
# [[1]]
# [1] "12" "1" "2" "15" "18"
# [[2]]
# [1] "15" "13" "14" "9" "1"
# [[3]]
# [1] "22" "9" "14" "3" "9" "20"

Another option is using rleid from library(data.table) to create the grouping variable and then split the values of 'x' that are not 0 (x[!!x]) with the grouping variable.
library(data.table)
split(x[!!x], rleid(!x)[!!x])

Related

Obtain mean vector for each parameter combination using R

I have a cvs file that has the following structure (minimum example):
ID Variable Vector
1 a [0,0,0]
2 a [1,2,3]
1 a [1,1,2]
2 a [1,2,3]
1 b [0,0,0]
2 b [1,1,1]
1 b [0,0,1]
2 b [3,5,7]
I would like to calculate the mean vector for each combination of parameters (in this case, ID and Variable). That is, I want to obtain a dataframe like the following one:
ID Variable Vector
1 a [0.5,0.5,1]
2 a [1,2,3]
1 b [0,0,0.5]
2 b [2,3,4]
I have generated this csv file with Python, that's why I have that structure with brackets. But I do not know how to start to do this using R. It doesn't seem to be a common data structure.
Update:
Vector variable structure (obtained from dput(head(data, 8))
Vector = c("[3, 16, 14, 5, 6, 13, 17, 7, 13, 6]",
"[7, 12, 6, 10, 6, 5, 16, 9, 19, 10]", "[4, 13, 4, 11, 6, 15, 17, 10, 12, 8]",
"[18, 11, 16, 8, 10, 10, 7, 4, 9, 7]", "[9, 9, 10, 17, 8, 13, 3, 13, 8, 10]",
"[17, 12, 7, 13, 6, 13, 8, 9, 5, 10]", "[9, 6, 14, 10, 8, 4, 8, 14, 15, 12]",
"[7, 13, 8, 10, 16, 8, 13, 13, 8, 4]")), row.names = c(NA, 8L
), class = "data.frame")
Assuming the 'Vector' column is a list, after grouping by 'ID', 'Variable', we reduce the 'Vector' by adding (+) the corresponding elements together and then divide by the total number of elements (n()) in that group
library(dplyr)
library(purrr)
out <- df1 %>%
group_by(ID, Variable) %>%
summarise(Vector = list(reduce(Vector, `+`)/n()), .groups = 'drop')
-output
out
# A tibble: 4 x 3
# ID Variable Vector
# <dbl> <chr> <list>
#1 1 a <dbl [3]>
#2 1 b <dbl [3]>
#3 2 a <dbl [3]>
#4 2 b <dbl [3]>
out$Vector
#[[1]]
#[1] 0.5 0.5 1.0
#[[2]]
#[1] 0.0 0.0 0.5
#[[3]]
#[1] 1 2 3
#[[4]]
#[1] 2 3 4
If the column 'Vector' is a character string, an option is to extract the numeric part into a list
library(stringr)
out <- df1 %>%
group_by(ID, Variable) %>%
summarise(Vector = list((str_extract_all(Vector, "\\d+") %>%
map(as.numeric) %>% reduce(`+`))/n()), .groups = 'drop')
data
df1 <- structure(list(ID = c(1, 2, 1, 2, 1, 2, 1, 2), Variable = c("a",
"a", "a", "a", "b", "b", "b", "b"), Vector = structure(list(c(0,
0, 0), c(1, 2, 3), c(1, 1, 2), c(1, 2, 3), c(0, 0, 0), c(1, 1,
1), c(0, 0, 1), c(3, 5, 7)), class = "AsIs")), class = "data.frame",
row.names = c(NA,
-8L))

How shift values in rows in R (dplyr or data.table)?

Here is data set 'before' and 'after' shifting.
# Data set 'before'
df_before <- t(data.table(
x = c(1, 2, 3, 4, 5),
y = c(0, 6, 7, 8, 9),
z = c(0, 0, 11, 12, 13)))
# Shift operation
# ...
# Data set 'after'
df_after <- t(data.table(
x = c(1, 2, 3, 4, 5),
y = c(6, 7, 8, 9, NA),
z = c(11, 12, 13, NA, NA)))
How to make this kind of shifting on +1 cell only for all rows?
Thanks!
Something like this? Just start the rows always shifted by one and reset their length. The latter adds NAs.
t(sapply(1:nrow(DF), function(x) `length<-`(DF[x, x:ncol(DF)], ncol(DF))))
# [,1] [,2] [,3] [,4] [,5]
# [1,] 1 2 3 4 5
# [2,] 6 7 8 9 NA
# [3,] 11 12 13 NA NA
Data
DF <- structure(c(1, 0, 0, 2, 6, 0, 3, 7, 11, 4, 8, 12, 5, 9, 13), .Dim = c(3L,
5L), .Dimnames = list(c("x", "y", "z"), NULL))
Taking a guess at the logic:
t(apply(df_before, 1, function(x) `length<-`(x[x != 0], ncol(df_before))))
[,1] [,2] [,3] [,4] [,5]
x 1 2 3 4 5
y 6 7 8 9 NA
z 11 12 13 NA NA
You can un-transpose the df_before data.frame then use the lead function from dplyr
to shift the columns
library(data.table)
library(dplyr)
df_before <- data.table(
x = c(1, 2, 3, 4, 5),
y = c(0, 6, 7, 8, 9),
z = c(0, 0, 11, 12, 13))
df_after <- t(data.table(
x = c(1, 2, 3, 4, 5),
y = c(6, 7, 8, 9, NA),
z = c(11, 12, 13, NA, NA)))
df_before[] <-lapply(1:ncol(df_before), function(x){
dplyr::lead(df_before[[x]],n= x-1)
})
If you need to transpose the data after this step:
df_after2 <- t(df_before)
all.equal(df_after,df_after2) # TRUE

How to generate sequence with exclusions in R

I need 4 functions that generate some numbers (each)
First function generates sequence from n odd numbers except 5, 15, 25, etc...
example with n=2: 1, 1, 3, 3, 7, 7, 9, 9, 11, 11, 13, 13, 17, 17,...
Second function generates sequence from n even numbers except 10, 20, 30, etc...
example witn n=2: 2, 2, 4, 4, 6, 6, 8, 8, 12, 12, 14, 14, 16, 16,...
Third function generates sequence from n numbers from 5 by 10
example witn n=2: 5, 5, 15, 15, 25, 25,...
Fourth function generates sequence from n numbers from 10 by 10
example witn n=2: 10, 10, 20, 20, 30, 30,...
Each function has to get vector 1: N and n as inputs.
For example,
f1(1:10, 3)
> 1, 1, 1, 3, 3, 3, 7, 7, 7, 9
f2(1:5, 10)
> 2, 2, 2, 2, 2
f3(1:15, 5)
> 5, 5, 5, 5, 5, 15, 15, 15, 15, 15, 25, 25, 25, 25, 25
f4(1:2, 1)
> 10, 20
I have some decision for first two functions but I don`t know how to exclude some numbers:
f1 <- function(x) 2*((x-1) %/% 10) + 1 # goes 1, 3, 5, etc for n = 10
f2 <- function(x) 2*((x-1) %/% 10 + 1) # goes 2, 4, 6, etc for n = 10
why not use seq and rep ?
n = 25
nrep = 2 # number of repetitions
by5 <- sort(rep(seq(5, n, by = 10), nrep )) # numbers from 5 by 10
by5
by10 <- sort(rep(seq(10, n, by = 10), nrep )) # numbers from 10 by 10
by10
odd <- sort(rep(seq(1, n, by = 2), nrep )) # odd number
odd[!odd %in% by5] # remove all the by5 values
even <- sort(rep(seq(2, n, by = 2), nrep )) # Even numbers
even[!even %in% by10] # remove all the by 10 values
output
> [1] 5 5 15 15 25 25
> [1] 10 10 20 20
> [1] 1 1 3 3 7 7 9 9 11 11 13 13 17 17 19 19 21 21 23 23
> [1] 2 2 4 4 6 6 8 8 12 12 14 14 16 16 18 18 22 22 24 24.

2 columns into list and sort in R

Let's say we have two list
x <- c(1, 3, 4, 2, 6, 5)
y <- c(12, 14, 15, 61, 71, 21)
I want to combine into a list so that we have 2 column x and y and values should be in same order.
x <- c(1, 3, 4, 2, 6, 5)
y <- c(12, 14, 15, 61, 71, 21)
After you have a list I want to sort it on y so the final list looks like
x <- c(1, 3, 4, 5, 2, 6)
y <- c(12, 14, 15, 21, 61, 71)
I am really new to R.
I tried list(x,y) but it seems to make a
list(1, 3, 4, 2, 6, 5, 12, 14, 15, 61, 71, 21)
so I was wondering someone could help me.
You need to put them in a data.frame first and then use order:
x <- c(1, 3, 4, 2, 6, 5)
y <- c(-12, 14, 15, 61, 71, 21)
DF <- data.frame(x, y)
> DF[order(DF$y),]
x y
1 1 -12
2 3 14
3 4 15
6 5 21
4 2 61
5 6 71
keeping as a list, using lapply:
x <- c(1, 3, 4, 2,6,5)
y <- c(12, 14,15,61,71,21)
l <- list(x = x, y = y)
## thelatemail
lapply(l, `[`, order(l$y))
# $x
# [1] 1 3 4 5 2 6
#
# $y
# [1] 12 14 15 21 61 71
a more explicit version of the short one given by #thelatemail above but doesn't preserve the names:
lapply(seq_along(l), function(x) l[[x]][order(l$y)])
# [[1]]
# [1] 1 3 4 5 2 6
#
# [[2]]
# [1] 12 14 15 21 61 71
or rapply:
rapply(l, function(x) x[order(l$y)], how = 'list')
# $x
# [1] 1 3 4 5 2 6
#
# $y
# [1] 12 14 15 21 61 71

Another nested loop in R

I have the following data and nested for loop:
x <- c(12, 27, 21, 16, 12, 21, 18, 16, 20, 23, 21, 10, 15, 26, 21, 22, 22, 19, 26, 26)
y <- c(8, 10, 7, 7, 9, 5, 7, 7, 10, 4, 10, 3, 9, 6, 4, 2, 4, 2, 3, 6)
a <- c(20,25)
a.sub <- c()
df <- c()
for(j in 1:length(a)){
a.sub <- which(x >= a[j])
for(i in 1:length(a.sub)){
df[i] <- y[a.sub[i]]
}
print(df)
}
I'd like the loop to return values for df as:
[1] 10 6 3 6 4 10 6 4 2 4 3 6
[1] 10 6 3 6
As I have it, however, the loop returns the same values twice of df for a <- 20 but not a <- 25:
[1] 10 7 5 10 4 10 6 4 2 4 3 6
[1] 10 6 3 6 4 10 6 4 2 4 3 6
for(i in 1:length(a.sub)){
df[i] <- y[a.sub[i]]
}
can become
df <- y[a.sub]
neither a.sub nor df need to be predefined then and thus...
x <- c(12, 27, 21, 16, 12, 21, 18, 16, 20, 23, 21, 10, 15, 26, 21, 22, 22, 19, 26, 26)
y <- c(8, 10, 7, 7, 9, 5, 7, 7, 10, 4, 10, 3, 9, 6, 4, 2, 4, 2, 3, 6)
a <- c(20,25)
for(j in 1:length(a)){
a.sub <- which(x >= a[j])
df <- y[a.sub]
print(df)
}
It could be made shorter. df is unnecessary if you're just printing the subset of y anyway. Just print it directly. And the selector is so short it wouldn't make a single line confusing. Furthermore, why use length of a and index.. loop through a directly. So, it could be...
a <- c(20,25)
for(ax in a){
print( y[ which(x >= ax) ] )
}
Not sure if this is a simplified version of a more complex problem, but I'd probably solve this using some direct indexing and an apply function. Something like this:
z <- cbind(x,y)
sapply(c(20,25), function(x) z[z[, 1] >= x, 2])
[[1]]
[1] 10 7 5 10 4 10 6 4 2 4 3 6
[[2]]
[1] 10 6 3 6

Resources