Count patterns and differentiate them - r

I'd like to count a defined pattern (here: 'Y') in a string for each row of a dataframe. Ideally, I'd like to get a number of occurrences in V3 and length in V4.
Input:
V1 V2
A XXYYYYY
B XXYYXX
C XYXXYX
D XYYXYX
Output:
V1 V2 V3 V4
A XXYYYYY 1 5
B XXYYXX 1 2
C XYXXYX 2 1,1
D XYYXYX 2 2,1
I tried different modifications of the function below, with no success.
dict <- setNames(nm=c("Y"))
seqs <- df$V2
sapply(dict, str_count, string=seqs)
Thanks in advance!

another base R solution but using regexpr:
df <- data.frame(
V1 = c("A", "B", "C", "D"),
V2 = c("XXYYYYY", "XXYYXX" , "XYXXYX", "XYYXYX")
)
extract match.length attribute of the regexpr output, then count length of each attribute (which tells you how many matches there are):
r <- gregexpr("Y+", df$V2)
len <- lapply(r, FUN = function(x) as.array((attributes(x)[[1]])))
df$V3 <- lengths(len)
df$V4 <- len
df
#V1 V2 V3 V4
#1 A XXYYYYY 1 5
#2 B XXYYXX 1 2
#3 C XYXXYX 2 1, 1
#4 D XYYXYX 2 2, 1
if you have an old version of R that doesn't have lengths yet you can use df$V3 <- sapply(len, length) instead.
and if you need a more generic function to do the same for any vector x and pattern a:
foo <- function(x, a){
ans <- data.frame(x)
r <- gregexpr(a, x)
len <- lapply(r, FUN = function(z) as.array((attributes(z)[[1]])))
ans$quantity <- lengths(len)
ans$lengths <- len
ans
}
try foo(df$V2, 'Y+').

Here is a stringr solution:
df <- data.frame(
V1 = c("A", "B", "C", "D"),
V2 = c("XXYYYYY", "XXYYXX" , "XYXXYX", "XYYXYX")
)
df$V3 <- str_count(df$V2, "Y+")
df$V4 <- lapply(str_locate_all(df$V2, "Y+"), function(x) {
paste(x[, 2] - x[, 1] + 1, collapse = ",")
})

In base R:
aaa <- data.frame(V1 = LETTERS[1:4],
V2 = c("XXYYYYY", "XXYYXX", "XYXXYX", "XYYXYX"),
stringsAsFactors = FALSE)
# split into strings of "Y"s
splt <- lapply(aaa$V2, function(x) unlist(strsplit(x, "[^Y]+"))[-1])
# number of occurrences
aaa$V3 <- lapply(splt, length)
# length of each occurence
aaa$V4 <- lapply(splt, function(x) paste(nchar(x), collapse = ","))

Related

Take last n columns with value

i have a basic R question: imagine the following code:
a <- c("A","B","C")
b <- c("A","B","C")
c <- c("A","X","C")
x <- c("A","B","C")
y <- c("","B","C")
z <- c("","","C")
frame <- data.frame(a,b,c,x,y,z)
now i want to get the content of the last 3 columns but only if they contain value. So the Output should look like this
new1 <- c("A","X","C")
new2 <- c("A","B","C")
new3 <- c("A","B","C")
frame2 <- data.frame(new1,new2,new3)
I am thankful for every help.
Using apply from base R
as.data.frame(t(apply(frame, 1, FUN = function(x) tail(x[nzchar(x)], 3))))
You can do,
new_frame <- frame[colSums(frame == '') == 0]
new_frame[tail(seq_along(new_frame), 3)]
b c x
1 A A A
2 B X B
3 C C C

Function to select value from data frame based on value in list of lists

In R I need to write a function that applies the following rules:
fkt <- function(v1,v2){
- find list name within listOfLists that contains v1
- go to column in df, where listname == colname
- return value from row where df$col2 == v2
}
For Example:
df <- data.frame(Col1= c(1,2,3,4,5),
Col2= c("AAA","BBB","CCC","DDD","EEE"),
A = c("22","22","23","23","24"),
B = c("210","210","202","220","203"),
C = c("2000","2000","2010","2010","2200")
)
listOflists <- list(A <- c(1281, 1282, 1285, 1286, 1289),
B <- c(100,200,300,400,500,600,700,800,900,101,202,303,404,505,606),
C <-c(1000,1500,2000,2500,3000,3050,4000,4500,6000)
)
Then
fkt(800,"BBB")
> 210
I tried
fkt<- function(v1,v2){
r <- which(df$Col2== v1)
s <- ifelse(v2 %in% A, df$A[r],
ifelse( v2 %in% B ,df$A[r],df$C[r]))
return(s)
}
Alas, the result is NA.
And writing many ifelse() statements is not efficient - especially because the listOfLists might comprise >50 lists.
Can anyone advice me how to write this function in a general, programming efficient way as described above?
df <- data.frame(Col1= c(1,2,3,4,5),
Col2= c("AAA","BBB","CCC","DDD","EEE"),
A = c("22","22","23","23","24"),
B = c("210","210","202","220","203"),
C = c("2000","2000","2010","2010","2200"))
# Be cautious : = and <- are not equivalent, you were creating variables not named fields
listOflists <- list(A = c(1281, 1282, 1285, 1286, 1289),
B = c(100,200,300,400,500,600,700,800,900,101,202,303,404,505,606),
C = c(1000,1500,2000,2500,3000,3050,4000,4500,6000))
f <- function(v1)
for (i in 1:length(listOflists))
if (v1 %in% listOflists[[i]]) return(names(listOflists)[i])
fkt <- function(v1,v2) df[df$Col2==v2,f(v1)]
fkt(800,"BBB")
#[1] 210
#Levels: 202 203 210 220

R pairwise operations

I have a data frame and would like to perform some specific operations on it.
dat <- data.frame(Name = LETTERS[1:3],
Val1 = rnorm(3),
Val2 = rnorm(3))
# > dat
# Name Val1 Val2
# 1 A -1.055050 0.4499766
# 2 B 0.414994 -0.5999369
# 3 C -1.311374 -0.3967634
I would like to do the following:
Pair-wise divide Val1 across the Names, e.g.
AB1 <- dat[dat$Name == "A", "Val1"] / dat[dat$Name == "B", "Val1"]
AC1 <- dat[dat$Name == "A", "Val1"] / dat[dat$Name == "C", "Val1"]
BC1 <- dat[dat$Name == "B", "Val1"] / dat[dat$Name == "C", "Val1"]
Pair-wise divide Val2 across the Names, e.g.
AB2 <- dat[dat$Name == "A", "Val2"] / dat[dat$Name == "B", "Val2"]
AC2 <- dat[dat$Name == "A", "Val2"] / dat[dat$Name == "C", "Val2"]
BC2 <- dat[dat$Name == "B", "Val2"] / dat[dat$Name == "C", "Val2"]
Subtract 2 from 1, e.g.
AB3 <- AB1 - AB2
AC3 <- AC1 - AC2
BC3 <- BC1 - BC2
The above works fine but I'd like to implement this in a smarter and scalable way (e.g many more Names and Vals), as well as storing the output in a data.frame where it is easier to programmatically extract values.
Finally, an even better solution would do this for the following data
dat2 <- data.frame(Region = rep(LETTERS[24:26], each=3),
Name = rep(LETTERS[1:3], 3),
Val1 = rep(rnorm(3), 3),
Val2 = rep(rnorm(3), 3))
> dat2
# Region Name Val1 Val2
# 1 X A 2.1098629 0.5779044
# 2 X B 0.5937334 0.1410554
# 3 X C 0.2819461 -1.1769578
# 4 Y A 2.1098629 0.5779044
# 5 Y B 0.5937334 0.1410554
# 6 Y C 0.2819461 -1.1769578
# 7 Z A 2.1098629 0.5779044
# 8 Z B 0.5937334 0.1410554
# 9 Z C 0.2819461 -1.1769578
Where the operations are the same as above but grouped by Region, so the output would be something like
> output
# Region AB3 AC3 BC3
# 1 X ? ? ?
# 2 Y ? ? ?
# 3 Z ? ? ?
where the ? are the actual results.
combn is a work-horse here, which can be used to generate unique pairwise combinations:
combn(as.character(dat$Name), 2, simplify=FALSE)
#[[1]]
#[1] "A" "B"
#
#[[2]]
#[1] "A" "C"
#
#[[3]]
#[1] "B" "C"
You can also pass the results of these pairwise combinations to a function then:
# set.seed(1)
##for reproducibility
combn(
as.character(dat$Name),
2,
FUN=function(x) do.call(`-`, dat[dat$Name == x[1], -1] / dat[dat$Name == x[2], -1])
)
#[1] -8.2526585 2.6940335 0.1818427
AB3
#[1] -8.252659
AC3
#[1] 2.694033
BC3
#[1] 0.1818427
With data.table, you can do it using the code below:
library(data.table)
dat <- data.table(Region = rep(LETTERS[24:26], each=3),
Name = rep(LETTERS[1:3], 3),
Val1 = rep(rnorm(3), 3),
Val2 = rep(rnorm(3), 3))
dat2 <- merge(dat, dat, by="Region", allow.cartesian = T)[Name.x < Name.y]
dat2[, Val1Ratio := Val1.x / Val1.y]
dat2[, Val2Ratio := Val2.x / Val2.y]
dat2[, Diff := Val1Ratio - Val2Ratio]
#thelatemail's answer worked well on the first part of my question. I originally tried to add the below information as an add-on edit to their answer to address the second part of my question. This edit was rejected so I'm putting it here as an answer:
One way to use #thelatemail's answer to address the second part of the question (relating to dat2 and grouping by Region) is the following:
library("dplyr")
Regions <- unique(dat2$Region)
out <- data.frame(Region = Regions, AB3 = NA, AC3 = NA, BC3 = NA)
for (i in 1:length(Regions)){
dat2temp <- dat2 %>% filter(Region==Regions[i])
out[i,2:4] <- combn(
as.character(dat2temp$Name),
2,
FUN = function(x) do.call(`-`, dat2temp[dat2temp$Name == x[1], -(1:2)] / dat2temp[dat2temp$Name == x[2], -(1:2)])
)
}
> out
# Region AB3 AC3 BC3
# 1 X -4.368693 -0.4772375 0.3004291
# 2 Y -4.368693 -0.4772375 0.3004291
# 3 Z -4.368693 -0.4772375 0.3004291
There are probably better solutions that avoid a for-loop, and I'd love to hear them. I expect there's a cleaner solution that uses dplyr::group_by or cut or similar.

column of unique values between of two other columns

sample data:
col1 col2
<NA> cc
a a
ab a
z a
I want to add a column unique with these values -- any valued that isn't shared between col1 and col2.
col1 col2 unique
<NA> cc cc
a a
ab a b
z a za
I tried using setdiff but
(for replication purposes:)
df <- read.table(header=TRUE, stringsAsFactors = FALSE, text =
"col1 col2
NA cc
a a
ab a
z a
")
Like this:
df$unique <- paste0(setdiff(df$col1, df$col2), setdiff(df$col2, df$col1))
But it returns
Error in `$<-.data.frame`(`*tmp*`, "unique", value = c("<NA>cc", "abcc" :
replacement has 2 rows, data has 3
From the error it looks like it's generating a vector of the differences between the columns, instead of the differences between the elements...
Edit: Added z and a sample data in last row.
You could do this using setdiff and Reduce in base R:
cols <- c(1,2)
df$unique <- unlist(lapply(apply(df[cols], 1, function(x)
Reduce(setdiff, strsplit(na.omit(x), split = ""))), paste0, collapse=""))
# col1 col2 unique
# 1 <NA> cc cc
# 2 a a
# 3 ab a b
Here is a length method with apply.
apply(df, 1, function(i) {
i <- i[!is.na(i)] # remove NAs
if(length(i[!is.na(i)]) == 1) i # check length and return singletons untouched
else { # for non-singletons
i <- unlist(strsplit(i, split="")) # strsplit and turn into a vector
i <- i[!(duplicated(i) | duplicated(i, fromLast=TRUE))] # drop duplicates
paste(i, collapse="")}}) # return collapsed singleton set of characters
[1] "cc" "" "b"
Note that for c("cc", "a", "c"), this will return "a" because "cc" and "c" will be marked as duplicates.
We need to split the string first:
df$unique <- mapply(function(x, y){
u <- setdiff(union(x, y), intersect(x, y))
paste0(u[!is.na(u)], collapse = '')
}, strsplit(df$col1, ''), strsplit(df$col2, ''))
# >df
# col1 col2 unique
# 1 <NA> cc c
# 2 a a
# 3 ab a b

R extract matching values from list of data frames

I have a relatively large amount of data stored in a list of data frames with several columns.
For each element of the list I wish to check one column against a reference and if present extract the value held in another column of the same element and place in a new summary matrix.
e.g. with the following example code:
add1 = c("N1","N1","N1")
coords1 = c(1,2,3)
vals1 = c("a","b","c")
extra1 = c("x","y","x")
add2 = c("N2","N2","N2","N2")
coords2 = c(2,3,4,5)
vals2 = c("b","c","d","e")
extra2 = c("z","y","x","x")
add3 = c("N3","N3","N3")
coords3 = c(1,3,5)
vals3 = c("a","c","e")
extra3 = c("z","z","x")
df1 <- data.frame(add1, coords1, vals1, extra1)
df2 <- data.frame(add2, coords2, vals2, extra2)
df3 <- data.frame(add3, coords3, vals3, extra3)
list_all <- list(df1, df2, df3)
coordinate.extract <- unique(unlist(lapply(list_all, "[", 1)))
my_matrix <- matrix(0, ncol = length(list_all)
, nrow = (length(coordinate.extract)))
my_matrix_new <- cbind(as.character(coordinate.extract)
, my_matrix)
I would like to end up with:
my_matrix_new = V1 V2 V3 V4
1 a a
2 b b
3 c c c
4 d
5 e e
i.e. the 3rd column of each list element is chosen based on the value of the second column.
I hope this is clear.
Thanks,
Matt
I would use data.frame as there are mixed classes. You may try merge with Reduce to get the expected output. Select the 2nd and 3rd columns,in each list element, change the column name for the 2nd to be same across all the list elements, merge, and if needed replace the NA elements with ''
lst1 <- lapply(list_all, function(x) {names(x)[2] <- 'V1';x[2:3] })
res <- Reduce(function(...) merge(..., by='V1', all=TRUE), lst1)
res[-1] <- lapply(res[-1], as.character)
res[is.na(res)] <- ''
res
# V1 vals1 vals2 vals3
#1 1 a a
#2 2 b b
#3 3 c c c
#4 4 d
#5 5 e e
We can change the column names
names(res) <- paste0('V', seq_along(res))

Resources