Finding the capital letters in a string

Finding the capital letters in a string - r

I want to find the capital letters in each string and count how many times each one occurs per string,
for example
t = c("gctaggggggatggttactactGtgctatggactac", "gGaagggacggttactaCgTtatggactacT", "gcGaggggattggcttacG")
ldply(str_match_all(t,"[A-Z]"),length)
when applying the above function my output is
1 4 2
But my desired output is
[1] G -1
[2] G -1
C -1
T -2
[3] G -2

You can extract all capital letters and then compute the frequencies with table:
library(stringr)
lapply(str_extract_all(t, "[A-Z]"), table)
# [[1]]
#
# G
# 1
#
# [[2]]
#
# C G T
# 1 1 2
#
# [[3]]
#
# G
# 2

If you extend docendo's answer to be your exact requested format
# For every input string: pull out its capital letters, tabulate them, and
# render each count as "<letter>-<count>".
lapply(
  stringr::str_extract_all(t, "[A-Z]"),
  function(caps) {
    counts <- table(caps)
    paste(names(counts), counts, sep = "-")
  }
)
# [[1]]
# [1] "G-1"
#
# [[2]]
# [1] "C-1" "G-1" "T-2"
#
# [[3]]
# [1] "G-2"
And here is how I would do it in the tidyverse:
library(tidyverse)
# One row per input sequence; `strings` holds the raw text.
data <- data.frame(
  strings = c(
    "gctaggggggatggttactactGtgctatggactac",
    "gGaagggacggttactaCgTtatggactacT",
    "gcGaggggattggcttacG"
  )
)
data %>%
  mutate(
    # list-column: the capital letters found in each string
    caps_freq = stringr::str_extract_all(strings, "[A-Z]"),
    # turn each letter vector into a (letter, Freq) frequency table
    caps_freq = map(caps_freq, function(letter) data.frame(table(letter)))
  ) %>%
  # name the list-column explicitly: unnest() without `cols` is deprecated
  unnest(caps_freq)
# strings letter Freq
# 1 gctaggggggatggttactactGtgctatggactac G 1
# 2 gGaagggacggttactaCgTtatggactacT C 1
# 3 gGaagggacggttactaCgTtatggactacT G 1
# 4 gGaagggacggttactaCgTtatggactacT T 2
# 5 gcGaggggattggcttacG G 2

Related

Create a variable "mydata" - A data structure that outputs the following

I am struggling to find a way to implement a matrix to my list. $mix$b becomes a character vector. Please help.
This is the task
# Create a variable "mydata" - a data structure with the output presented
# below as comments
# > print(mydata)
# [[1]]
# [1] "Some long text"
#
# [[2]]
# [1] 1 2 3 4 5
#
# $mix
# $mix$a
# [1] "text"
#
# $mix$b
# a b c
# 1 a -2 3
# 2 b 0 4
# 3 c 2 -5
# 4 d 4 77
This is my best attempt
mydata <- list("Some long text", 1:5,
mix = list(a = 'text', b = c(a = "a", b = -2, c = 3)))
mydata
output
[[1]]
[1] "Some long text"
[[2]]
[1] 1 2 3 4 5
$mix
$mix$a
[1] "text"
$mix$b
a b c
"a" "-2" "3"

# Build the nested structure bottom-up: the data frame first, then the named
# "mix" sub-list, then the outer list whose first two elements are unnamed.
mydata <- local({
  mix_table <- data.frame(
    a = c("a", "b", "c", "d"),
    b = c(-2, 0, 2, 4),
    c = c(3, 4, -5, 77)
  )
  mix_part <- list(a = "text", b = mix_table)
  list("Some long text", 1:5, mix = mix_part)
})
mydata
[[1]]
[1] "Some long text"
[[2]]
[1] 1 2 3 4 5
$mix
$mix$a
[1] "text"
$mix$b
a b c
1 a -2 3
2 b 0 4
3 c 2 -5
4 d 4 77
Here are some examples of how to extract the 77:
mydata$mix$b[4,3]
mydata[[3]][[2]][4,3]
mydata[["mix"]][["b"]][4,3]
mydata[[3]][["b"]][4,3]
mydata[["mix"]][[2]][4,3]
77

Group column according to categories in a list

I have data frame like this:
library(dplyr)
set.seed(5)
# generate sample data
df <- data.frame(value = 1:10,
type = sample(LETTERS, 10))
value type
1 1 B
2 2 K
3 3 O
4 4 Y
5 5 I
6 6 U
7 7 G
8 8 S
9 9 C
10 10 F
I want to group the column "type" according to categories defined in a list:
groups <- list(LETTERS[1:7],
LETTERS[8:15],
LETTERS[16:20],
"other")
print(groups)
# [[1]]
# [1] "A" "B" "C" "D" "E" "F" "G"
#
# [[2]]
# [1] "H" "I" "J" "K" "L" "M" "N" "O"
#
# [[3]]
# [1] "P" "Q" "R" "S" "T"
#
# [[4]]
# [1] "other"
The output should be like:
value type group
1 1 B 1
2 2 K 2
3 3 O 2
4 4 Y other
5 5 I 2
6 6 U other
7 7 G 1
8 8 S 3
9 9 C 1
10 10 F 1
My approach works as follows:
# group data
df_grouped <- df %>%
mutate(group = ifelse(type %in% groups[[1]], 1,
ifelse(type %in% groups[[2]], 2,
ifelse(type %in% groups[[3]], 3, "other"))))
Since I have many more groups, I do not like the nested ifelse calls in the code; they are not easy to maintain. Is there a more efficient way to achieve this?

A simple way to do this would be to convert groups to a data frame using reshape2::melt and perform a left_join:
library(dplyr)
library(tidyr)
library(reshape2)
# melt(groups) yields a long lookup table: `value` is the type and `L1` the
# (integer) index of the list element it came from.
left_join(df, melt(groups), by = c(type = "value")) %>%
  # replace_na() is type-strict in current tidyr: an integer column cannot be
  # filled with a string, so convert the group index to character first
  mutate(L1 = as.character(L1)) %>%
  # types found in no group have L1 = NA after the join
  replace_na(list(L1 = "other")) %>%
  rename(group = L1)
#> value type group
#> 1 1 B 1
#> 2 2 K 2
#> 3 3 O 2
#> 4 4 Y other
#> 5 5 I 2
#> 6 6 U other
#> 7 7 G 1
#> 8 8 S 3
#> 9 9 C 1
#> 10 10 F 1
A base R method that gives the same result would be
# For each type, find the index of the group that contains it.
# Always returns a character vector ("1", "2", ..., "other") so the column has
# one type whether or not every value is matched.
df$group <- vapply(as.character(df$type), function(s) {
  # positions of all groups containing this type
  i <- which(vapply(groups, function(g) s %in% g, logical(1)))
  # no match -> "other"; if groups overlap, the first match wins
  if (length(i) < 1) "other" else as.character(i[1])
}, character(1), USE.NAMES = FALSE)

We can use enframe with join
library(dplyr)
library(tibble)
library(tidyr)
# Long lookup table: one row per (group index, type) pair. The index is stored
# as character so it can share a column with the "other" label.
group_lookup <- enframe(groups, name = "group", value = "type") %>%
  unnest(type) %>%
  mutate(group = as.character(group))
# Joining from df keeps df's row order, and the explicit key avoids the
# "Joining with `by = ...`" message. Types found in no group become "other".
df %>%
  left_join(group_lookup, by = "type") %>%
  mutate(group = replace_na(group, "other"))

Here is a base R option using stack + merge
# stack() turns the (index-named) group list into a long table with columns
# `values` (type) and `ind` (group index); all.x = TRUE keeps rows of df whose
# type is in no group (ind = NA).
out <- type.convert(
  merge(
    df,
    stack(setNames(groups, seq_along(groups))),
    by.x = "type", by.y = "values", all.x = TRUE
  ),
  as.is = TRUE  # must be passed explicitly; omitting it warns in R >= 4.0
)
# Fill unmatched groups, then restore df's row order (merge() sorts by key).
replace(out, is.na(out), "other")[match(df$value, out$value), ]
which gives
type value ind
1 B 1 1
6 K 2 2
7 O 3 2
10 Y 4 other
5 I 5 2
9 U 6 other
4 G 7 1
8 S 8 3
2 C 9 1
3 F 10 1

Convert the list to a named vector and use a standard lookup:
# Named lookup vector: names are the types, values the index of their group.
v <- setNames(rep(seq_along(groups), lengths(groups)), unlist(groups))
# Look up by name. as.character() guards against a factor column being used as
# integer positions; unname() keeps the lookup names out of the new column.
v <- unname(v[as.character(df$type)])
# Types found in no group come back as NA.
df$group <- replace(v, is.na(v), "other")
Another base alternative: The levels of a factor are renamed using a named list:
# Assigning a named list to levels() merges levels: each list name becomes a
# new level covering the old levels listed under it.
group_levels <- setNames(groups, seq_along(groups))
df$group <- factor(df$type)
levels(df$group) <- group_levels
Now the "other" group is represented by NA. If you wish to change it:
# Switch to character so a new label can be introduced, then fill the NAs.
df$group <- as.character(df$group)
df$group <- replace(df$group, is.na(df$group), "other")

apply similar variable to multiple dataset in r

I have 6 data frames named dat1 to dat6. I want to add a variable region to each of them and label them in a similar way, like this:
dat1$region <- paste("NE-1")
dat2$region <- paste("NE-2")
dat3$region <- paste("NE-3")
dat4$region <- paste("NE-4")
dat5$region <- paste("NE-5")
How can I write this code in a more concise way? using apply or for-loop?
Thanks!!

One option is to use get and assign functions in a for-loop.
Sample data:
dat1 <- data.frame(id=1:4, region = letters[1:4])
dat2 <- data.frame(id=5:8, region = letters[5:8])
dat3 <- data.frame(id=9:12, region = letters[9:12])
dat4 <- data.frame(id=13:16, region = letters[13:16])
dat5 <- data.frame(id=17:20, region = letters[17:20])
dat1
# id region
# 1 1 a
# 2 2 b
# 3 3 c
# 4 4 d
Apply for-loop:
# Tag dat1..dat5 in place: fetch each data frame by name, set its region
# label to "NE-<index>", and write it back under the same name.
for (idx in seq_len(5)) {
  obj_name <- paste0("dat", idx)
  updated <- get(obj_name)
  updated$region <- paste0("NE-", idx)
  assign(obj_name, updated)
}
Verify results:
dat1
# id region
# 1 1 NE-1
# 2 2 NE-1
# 3 3 NE-1
# 4 4 NE-1
dat5
# id region
# 1 17 NE-5
# 2 18 NE-5
# 3 19 NE-5
# 4 20 NE-5

Keep all dataframes in a list then use lapply:
# example dataframes
dat1 <- cars[1:2, ]
dat2 <- cars[3:4, ]
dat3 <- cars[5:6, ]
myList <- list(dat1, dat2, dat3)
# myList
# [[1]]
# speed dist
# 1 4 2
# 2 4 10
#
# [[2]]
# speed dist
# 3 7 4
# 4 7 22
#
# [[3]]
# speed dist
# 5 8 16
# 6 9 10
Then it is easier to do repetitive operations. Loop through the list, add region column:
# Walk the data frames together with their positions and stamp each one with
# "NE-<position>". unname() keeps the result an unnamed list.
res <- unname(Map(
  function(frame, pos) {
    frame$region <- paste0("NE-", pos)
    frame
  },
  myList,
  seq_along(myList)
))
res
# [[1]]
# speed dist region
# 1 4 2 NE-1
# 2 4 10 NE-1
#
# [[2]]
# speed dist region
# 3 7 4 NE-2
# 4 7 22 NE-2
#
# [[3]]
# speed dist region
# 5 8 16 NE-3
# 6 9 10 NE-3

How about this (assuming all your items start with dat and end with a unique identifier string):
# Names of all data frames whose name starts with "dat". Filtering on
# is.data.frame() keeps helper variables such as dat_names/dat_ID out.
dat_names <- Filter(
  function(nm) is.data.frame(get(nm)),
  ls(pattern = "^dat")
)
# Identifier = whatever follows the "dat" prefix.
dat_ID <- sub("^dat", "", dat_names)
for (d in seq_along(dat_names)) {
  # dat_names holds strings, so the object has to be fetched, modified and
  # written back; assigning into dat_names[[d]] would only alter the string.
  temp <- get(dat_names[d])
  temp$region <- paste0("NE-", dat_ID[d])
  assign(dat_names[d], temp)
}

How to increment vector in r by a fixed value and generate histogram of each iteration

I'm looking to increment each value in the vector by 1 until a set value is reached, saving each iteration in a vector; further iterations should not include values past the set value. For instance, say the set value is 3. Consider this vector, A <- c(1,1,2). Then the desired outcome should be:
Outcome:
1 1 2
2 2 3
3 3
Then I want to store each line in a vector so I can plot a histogram
so with each vector outcome including the original vector.
hist(c(1,1,2))
hist(c(2,2,3))
hist(c(3,3))
Potential code:
for (i in 1:length(A)) {
A[i] <- A + 1
}

# given values
A <- c(1, 1, 2)
value <- 3
# incrementations: every element counts up to `value` (not a hard-coded 3);
# an element already above `value` contributes an empty sequence instead of
# counting down
out_lst <- lapply(A, function(x) if (x <= value) x:value else integer(0))
# [[1]]
# [1] 1 2 3
#
# [[2]]
# [1] 1 2 3
#
# [[3]]
# [1] 2 3
# histograms: step l collects the l-th value of every sequence, with NA where
# a sequence has already ended
# max(0L, ...) keeps this defined when there are no sequences at all
max_len <- max(0L, lengths(out_lst))
hist_lst <- lapply(seq_len(max_len), function(l) {
  vapply(out_lst, function(x) x[l], numeric(1))
})
hist_lst
# [[1]]
# [1] 1 1 2
#
# [[2]]
# [1] 2 2 3
#
# [[3]]
# [1] 3 3 NA
par(mfrow = c(1, length(hist_lst)))
invisible(lapply(hist_lst, hist))

You can use a while loop:
# Record successive increments of a vector.
#
# vec: vector to increment; stored unchanged as the first result.
# max: upper bound; after each +1 step, elements greater than `max` are dropped.
#
# Returns a list: the input, then each incremented-and-filtered vector, until
# nothing is left. NAs and all attributes (e.g. names) are removed from every
# vector after the first. An empty input gives an empty list.
funfun <- function(vec, max) {
  steps <- list()
  while (length(vec) != 0) {
    steps[[length(steps) + 1]] <- vec
    bumped <- vec + 1
    # mark values past the bound, then drop them along with any other NA
    bumped[bumped > max] <- NA
    kept <- na.omit(bumped)
    # na.omit() attaches bookkeeping attributes; strip everything
    attributes(kept) <- NULL
    vec <- kept
  }
  steps
}
funfun(c(1,1,2),3)
[[1]]
[1] 1 1 2
[[2]]
[1] 2 2 3
[[3]]
[1] 3 3
you can now do
# hist() is called only for its plots. lapply() + invisible() avoids sapply()
# trying to simplify the returned histogram objects and printing them.
invisible(lapply(funfun(c(1, 1, 2), 3), hist))

How to permute list levels in R

Is there a way to permute list levels in R?
In other words, how can I simply go from the list test to test2?
test=list(
a=list("alpha"=1:2,"beta"=3:5),
b=list("alpha"=5:6,"omega"=7:10)
)
test2 <- list(
alpha=list("a"=1:2,"b"=5:6),
beta=list("a"=3:5),
omega=list("b"=7:10)
)

How about this:
# Flatten one level WITHOUT the outer names, so each element keeps exactly its
# inner name (alpha, beta, ...) instead of a pasted "a.alpha" label.
tst <- unlist(unname(test), recursive = FALSE)
# Relabel every element with the outer name it came from, then regroup by the
# inner name. No regex is involved, so names of any length (and names
# containing digits or dots) survive unchanged.
lst <- split(setNames(tst, rep(names(test), lengths(test))), names(tst))
lst
# $alpha
# $alpha$a
# [1] 1 2
#
# $alpha$b
# [1] 5 6
#
#
# $beta
# $beta$a
# [1] 3 4 5
#
#
# $omega
# $omega$b
# [1] 7 8 9 10

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Finding the capital letters in a string - r

You can extract all capital letters and then compute the frequencies with table: library(stringr) lapply(str_extract_all(t, "[A-Z]"), table) # [[1]] # # G # 1 # # [[2]] # # C G T # 1 1 2 # # [[3]] # # G # 2

Related

Create a variable "mydata" - A data structure that outputs the following

Group column according to categories in a list

apply similar variable to multiple dataset in r

How to increment vector in r by a fixed value and generate histogram of each iteration

How to permute list levels in R

Categories

Resources