Given the following named vector:
x <- c(54, 36, 67, 25, 76)
names(x) <- c('a', 'b', 'c', 'd', 'e')
How can one extract the elements between 'b' and 'd'? I can do that for data tables with dplyr::select(dt, b:d), but for some reason I cannot find a solution for named vectors (all the examples I find extract element(s) by giving all the names, not a range of names)...
You could do
x[which(names(x) == "b"):which(names(x) == "d")]
#> b c d
#> 36 67 25
The problem is that there is no guarantee in a named vector that names are unique, and if there are duplicate names the entire concept becomes meaningless.
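For instance, with a duplicated name (a small made-up vector, just to illustrate), which() returns more than one position, so "the elements between 'b' and 'd'" is no longer well defined:
y <- c(a = 1, b = 2, b = 3, d = 4)
which(names(y) == "b")
#> [1] 2 3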
If you wanted a complete solution that allows for tidyverse-style non-standard evaluation and sensible error messages, you could have:
subset_named <- function(data, exp)
{
  if(missing(exp)) return(data)
  exp <- as.list(match.call())$exp
  if(is.numeric(exp)) return(data[exp])
  if(is.character(exp)) return(data[exp])
  tryCatch({
    ss <- suppressWarnings(eval(exp))
    return(data[ss])},
    error = function(e)
    {
      if(as.character(exp[[1]]) != ":")
        stop("`exp` must be a sequence created by ':'")
      n <- names(data)
      first <- as.character(exp[[2]])
      second <- as.character(exp[[3]])
      first_match <- which(n == first)
      second_match <- which(n == second)
      if(length(first_match) == 0)
        stop("\"", first, "\" not found in names(",
             deparse(substitute(data)), ")")
      if(length(second_match) == 0)
        stop("\"", second, "\" not found in names(",
             deparse(substitute(data)), ")")
      if(length(first_match) > 1) {
        warning("\"", first,
                "\" found more than once. Using first occurrence only")
        first_match <- first_match[1]
      }
      if(length(second_match) > 1) {
        warning("\"", second,
                "\" found more than once. Using first occurrence only")
        second_match <- second_match[1]
      }
      return(data[first_match:second_match])
    })
}
That allows the following behaviour:
subset_named(x, "b":"d")
#> b c d
#> 36 67 25
subset_named(x, b:d)
#> b c d
#> 36 67 25
subset_named(x, 1:3)
#> a b c
#> 54 36 67
subset_named(x, "e")
#> e
#> 76
subset_named(x)
#> a b c d e
#> 54 36 67 25 76
One option could be:
x[Reduce(`:`, which(names(x) %in% c("b", "d")))]
b c d
36 67 25
You can use match in base R :
x[match('b', names(x)):match('d', names(x))]
# b c d
#36 67 25
Or, if you want to use something like b:d, convert the vector into a data frame so that the names become columns:
library(dplyr)
t(x) %>%
as.data.frame() %>%
select(b:d)
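If you then want the result back as a named vector rather than a one-row data frame (assuming that is what you need), you can finish with unlist():
t(x) %>%
  as.data.frame() %>%
  select(b:d) %>%
  unlist()
#  b  c  d
# 36 67 25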
1) subset In base R this can be done using the select argument of subset. The only catch is that only the data.frame method of subset supports the select argument, but we can convert x to a data.frame and then convert back. It also allows more complex specifications such as c(b:d, d).
unlist(subset(data.frame(as.list(x)), select = b:d))
## b c d
## 36 67 25
2) evalq Another base R possibility is to create a list with the values 1, 2, 3, ... and the same names as x, and then evaluate b:d with respect to it, giving the desired indexes, which can then be used to index into x. This also allows complex specifications as in (1).
x[ evalq(b:d, setNames(as.list(seq_along(x)), names(x))) ]
## b c d
## 36 67 25
We could turn this into a function like this:
sel <- function(x, select, envir = parent.frame()) {
ix <- setNames(as.list(seq_along(x)), names(x))
x[ eval(substitute(select), ix, envir) ]
}
sel(x, b:d)
sel(x, c(b:c, d))
sel(x, d:b) # reverse order
3) logical condition Again with only base R, if the names are in sorted order, as in the question, then we can check for names between the endpoints:
x[names(x) >= "b" & names(x) <= "d"]
## b c d
## 36 67 25
4) zoo If the names are in ascending order, as in the question, we could create a zoo series with those names as the times and then use window.zoo to pick out the subseries and finally convert back.
library(zoo)
coredata(window(zoo(x, names(x)), start = "b", end = "d"))
## b c d
## 36 67 25
I currently have a string in R that looks like this:
a <- "BMMBMMMMBMMMBMMBBMMM"
First, I need to determine the frequency of different patterns of "M" that appear in the string.
In this example it would be:
MM = 2
MMM = 2
MMMM = 1
Secondly, I then need to designate a numerical value/score for each different pattern.
i.e:
MM = 1
MMM = 2
MMMM = 3
This would mean that the total value/score of M's in a would equal 9.
If anyone knows any script that would allow me to do this for multiple strings like this in a dataframe, that would be great.
Thank you.
a <- "BMMBMMMMBMMMBMMBBMMM"
tbl <- table(strsplit(a, "B"), exclude="")
tbl
# MM MMM MMMM
# 2 2 1
score <- sum(tbl * 1:3)
score
# 9
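If you need to do this for a whole column of such strings in a data frame (as the question mentions), one way, sketched here with a hypothetical column named code, is to wrap the same idea in sapply():
df <- data.frame(code = c("BMMBMMMMBMMMBMMBBMMM", "BMMBMMM"),
                 stringsAsFactors = FALSE)
df$score <- sapply(df$code, function(s) {
  runs <- strsplit(s, "B")[[1]]
  runs <- runs[runs != ""]   # drop empty pieces left by consecutive B's
  sum(nchar(runs) - 1)       # MM scores 1, MMM scores 2, MMMM scores 3, ...
}, USE.NAMES = FALSE)
df$score
# [1] 9 3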
You could also use the table function.
a_list <- unlist(strsplit(a, "B"))
a_list <- a_list[!a_list == ""] # remove empty strings left when two B's are adjacent
a_list <- table(a_list)
a_list
# MM MMM MMMM
# 2 2 1
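To then get the total score with the weights from the question (MM = 1, MMM = 2, MMMM = 3, i.e. the run length minus one), you could do:
sum(a_list * (nchar(names(a_list)) - 1))
# 9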
Here's a solution that uses the dplyr package. First, I load the library and define my string.
library(dplyr)
a <- "BMMBMMMMBMMMBMMBBMMM"
Next, I define a function that finds the runs of character x in string y and counts how many runs there are of each length.
char_count <- function(x, y){
  # Get runs of the same character
  tmp <- rle(strsplit(y, split = "")[[1]])
  # Count runs of the character stored in `x`, grouped by run length
  tmp <- data.frame(table(tmp$lengths[tmp$values == x]))
  # Return strings and frequencies (convert the factor Var1 to its numeric value
  # before repeating; otherwise the level codes 1, 2, 3 would be used)
  tmp %>%
    mutate(String = strrep(x, as.numeric(as.character(Var1)))) %>%
    select(String, Freq)
}
Then, I run the function.
# Run the function
res <- char_count("M", a)
#   String Freq
# 1     MM    2
# 2    MMM    2
# 3   MMMM    1
Finally, I define my value vector and calculate the total value of vector a.
# My value vector
value_vec <- c(MM = 1, MMM = 2, MMMM = 3)
# Total `value` of vector `a`
sum(value_vec * res$Freq)
#[1] 9
If it's acceptable to skip the first step, you could do:
nchar(gsub("(B+M)|(^M)","",a))
# [1] 9
This works because each run of k M's scores k - 1 under the question's scheme, and the gsub() removes every B together with exactly one M from each run (or a leading M), so what remains is one M per point and nchar() gives the total score.
First, compute all different patterns that appear in your string:
a <- "BMMBMMMMBMMMBMMBBMMM"
chars = unlist(strsplit(a, ""))
pat = c()
for (i in 1:length(chars)){
  for (j in 1:(length(chars) - i + 1)){
    pat = c(pat, paste(chars[j:(j + i - 1)], collapse = ""))
  }
}
pat = sort(unique(pat))
pat[1:5]
# [1] "B"     "BB"    "BBM"   "BBMM"  "BBMMM"
Next, count the occurrences of each pattern:
counts = sapply(pat, function(w) length(gregexpr(w, a, fixed = TRUE)[[1]]))
Finally, build a data frame to summarize everything:
df = data.frame(counts = counts, num = 1:length(pat))
head(df, 10)
counts num
B 6 1
BB 1 2
BBM 1 3
BBMM 1 4
BBMMM 1 5
BM 5 6
BMM 5 7
BMMB 2 8
BMMBB 1 9
BMMBBM 1 10
library(stringr)
str_count(a, "MMMM")
gives 1
str_count(gsub("MMMM", "", a), "MMM") # now count how many times "MMM" occurs, but first delete the "MMMM"
gives 2
str_count(gsub("MMM", "", a), "MM") #now count how many times "MM" occurs, but first delete the "MMM"'s
gives 2
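Combining these counts with the scores from the question (MM = 1, MMM = 2, MMMM = 3) then gives the total:
n4 <- str_count(a, "MMMM")
n3 <- str_count(gsub("MMMM", "", a), "MMM")
n2 <- str_count(gsub("MMM", "", a), "MM")
n2 * 1 + n3 * 2 + n4 * 3
gives 9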
Let's say I have:
v = rep(c(1,2, 2, 2), 25)
Now, I want to count the number of times each unique value appears. unique(v) returns the unique values, but not how many times each one appears.
> unique(v)
[1] 1 2
I want something that gives me
length(v[v==1])
[1] 25
length(v[v==2])
[1] 75
but as a more general one-liner :) Something close (but not quite) like this:
#<doesn't work right> length(v[v==unique(v)])
Perhaps table is what you are after?
dummyData = rep(c(1,2, 2, 2), 25)
table(dummyData)
# dummyData
# 1 2
# 25 75
## or another presentation of the same data
as.data.frame(table(dummyData))
# dummyData Freq
# 1 1 25
# 2 2 75
If you have multiple factors (= a multi-dimensional data frame), you can use the dplyr package to count unique values in each combination of factors:
library("dplyr")
data %>% group_by(factor1, factor2) %>% summarize(count=n())
It uses the pipe operator %>% to chain method calls on the data frame data.
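For example, with a small made-up data frame (factor1 and factor2 are just placeholder column names):
data <- data.frame(factor1 = c("a", "a", "b", "b", "b"),
                   factor2 = c("x", "x", "x", "y", "y"),
                   stringsAsFactors = FALSE)
data %>% group_by(factor1, factor2) %>% summarize(count = n())
# # A tibble: 3 x 3
# # Groups:   factor1 [2]
#   factor1 factor2 count
#   <chr>   <chr>   <int>
# 1 a       x           2
# 2 b       x           1
# 3 b       y           2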
Here is a one-line approach using aggregate:
> aggregate(data.frame(count = v), list(value = v), length)
value count
1 1 25
2 2 75
length(unique(df$col)) is the simplest way I can see, though note that it gives the number of distinct values rather than the count of each one.
The table() function is a good way to go, as Chase suggested.
If you are analyzing a large dataset, an alternative is to use the .N function from the data.table package.
Make sure you have installed the data.table package with
install.packages("data.table")
Code:
# Load the data.table package
library(data.table)
# Generate a data.table object, which draws a number 10^7 times
# from 1 to 10 with replacement
DT <- data.table(x = sample(1:10, 1e7, TRUE))
# Count the frequency of each value
DT[, .N, by = x]
To get an un-dimensioned integer vector that contains the count of unique values, use c().
dummyData = rep(c(1, 2, 2, 2), 25) # Chase's reproducible data
c(table(dummyData)) # get un-dimensioned integer vector
1 2
25 75
str(c(table(dummyData)) ) # confirm structure
Named int [1:2] 25 75
- attr(*, "names")= chr [1:2] "1" "2"
This may be useful if you need to feed the counts of unique values into another function, and is shorter and more idiomatic than the t(as.data.frame(table(dummyData))[,2]) posted in a comment on Chase's answer. Thanks to Ricardo Saporta, who pointed this out to me here.
This works for me. Take your vector v
length(summary(as.factor(v),maxsum=50000))
Comment: set maxsum to be large enough to capture the number of unique values
or with the magrittr package
v %>% as.factor %>% summary(maxsum=50000) %>% length
Also making the values categorical and calling summary() would work.
> v = rep(as.factor(c(1,2, 2, 2)), 25)
> summary(v)
1 2
25 75
You can also try the tidyverse:
library(tidyverse)
dummyData %>%
  as_tibble() %>%
  count(value)
# A tibble: 2 x 2
value n
<dbl> <int>
1 1 25
2 2 75
If you need the count of each value as an additional column in the data frame containing your values (a column which may represent sample size, for example), plyr provides a neat way:
data_frame <- data.frame(v = rep(c(1,2, 2, 2), 25))
library("plyr")
data_frame <- ddply(data_frame, .(v), transform, n = length(v))
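For the example data this adds a column n with the group size on every row, so the first rows look like:
head(data_frame)
#   v  n
# 1 1 25
# 2 1 25
# 3 1 25
# 4 1 25
# 5 1 25
# 6 1 25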
You can also try dplyr::count
df <- tibble(x=c('a','b','b','c','c','d'), y=1:6)
dplyr::count(df, x, sort = TRUE)
# A tibble: 4 x 2
x n
<chr> <int>
1 b 2
2 c 2
3 a 1
4 d 1
If you want to run unique on a data.frame (e.g., train.data), and also get the counts (which can be used as the weight in classifiers), you can do the following:
unique.count = function(train.data, all.numeric=FALSE) {
  # first convert each row in the data.frame to a string
  train.data.str = apply(train.data, 1, function(x) paste(x, collapse=','))
  # use table to index and count the strings
  train.data.str.t = table(train.data.str)
  # get the unique data string from the row.names
  train.data.str.uniq = row.names(train.data.str.t)
  weight = as.numeric(train.data.str.t)
  # convert the unique data string back to a data.frame
  if (all.numeric) {
    train.data.uniq = as.data.frame(t(apply(cbind(train.data.str.uniq), 1,
      function(x) as.numeric(unlist(strsplit(x, split=","))))))
  } else {
    train.data.uniq = as.data.frame(t(apply(cbind(train.data.str.uniq), 1,
      function(x) unlist(strsplit(x, split=",")))))
  }
  names(train.data.uniq) = names(train.data)
  list(data=train.data.uniq, weight=weight)
}
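The answer does not include a usage example; a small hypothetical one could look like this:
train.data <- data.frame(a = c(1, 1, 2), b = c(3, 3, 4))
res <- unique.count(train.data, all.numeric = TRUE)
res$data
#   a b
# 1 1 3
# 2 2 4
res$weight
# [1] 2 1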
I know there are many other answers, but here is another way to do it using the sort and rle functions. The function rle stands for Run Length Encoding. It can be used for counts of runs of numbers (see the R man docs on rle), but can also be applied here.
test.data = rep(c(1, 2, 2, 2), 25)
rle(sort(test.data))
## Run Length Encoding
## lengths: int [1:2] 25 75
## values : num [1:2] 1 2
If you capture the result, you can access the lengths and values as follows:
## rle returns a list with two items.
result.counts <- rle(sort(test.data))
result.counts$lengths
## [1] 25 75
result.counts$values
## [1] 1 2
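If you want the same named-count form that table() gives, you can combine the two pieces:
setNames(result.counts$lengths, result.counts$values)
##  1  2
## 25 75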
count_unique_words <- function(wlist) {
  ucountlist = list()   # counts, indexed by word
  unamelist = c()       # words seen so far
  for (i in wlist) {
    if (is.element(i, unamelist)) {
      ucountlist[[i]] <- ucountlist[[i]] + 1
    } else {
      ucountlist[[i]] <- 1
      unamelist <- c(unamelist, i)
    }
  }
  ucountlist
}
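The variable population is not defined above; it is assumed to be a character vector of words, for example:
population <- c("apple", "pear", "apple", "plum", "pear", "apple")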
expt_counts <- count_unique_words(population)
for(i in names(expt_counts))
cat(i, expt_counts[[i]], "\n")
I have some data that are encoded like a battleship game, like this: A0, A1, B0, B4, K12, and I want to transform these into coordinate points. The letter should be the x-coordinate and the number the y-coordinate. Besides that, I need to convert the letters to numbers so I can multiply them, like this:
A0 = 0 , 0;
A1 = 0 , 15;
A2 = 0 , 30;
B3 = 15 , 45
Here you go:
BattleshipConversion <- function(mystring)
{
  return(c(which(LETTERS == substr(mystring, 1, 1)) - 1,
           as.integer(substr(mystring, 2, 3))) * 15)
}
Result:
>BattleshipConversion("B1")
15 15
>BattleshipConversion("A10")
0 150
So what is happening above?
LETTERS is an R pre-generated vector of capital letters. which takes the index position of the letter in that vector, so which(LETTERS=='A') will give 1. We subtract 1 from that.
substr is a function that extracts a substring from a string, taking the string, start and stop positions as arguments. Counting starts with the first element, which in R is 1. substr(mystring,1,1) takes the first character of mystring and stops there.
as.integer simply converts the 1-2 digit integer stored as character into a proper integer format.
we save it all in a combined vector using c(), and everything gets multiplied by 15, per the OP's specification
the function returns the result.
Note that this assumes your input string is correctly formatted. It will only work up to Z and 99, i.e. will fail on an AA14 or B101. You may want to add in some safeguards.
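A minimal sketch of such a safeguard, assuming the input should be a single letter A-Z followed by one or two digits, could be:
BattleshipConversion <- function(mystring)
{
  # reject anything that is not one letter followed by 1-2 digits
  stopifnot(grepl("^[A-Z][0-9]{1,2}$", mystring))
  return(c(which(LETTERS == substr(mystring, 1, 1)) - 1,
           as.integer(substr(mystring, 2, 3))) * 15)
}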
This is vectorized and can be extended to double letters easily:
fun <- function(s) {
  x <- gsub("[[:digit:]]", "", s)  # remove numbers
  y <- gsub("[[:alpha:]]", "", s)  # remove letters
  x <- match(x, LETTERS) - 1       # match against letters
  y <- as.integer(y)
  cbind(x = x * 15, y = y * 15)
}
fun(c("A0", "A1", "A2", "B3"))
# x y
#[1,] 0 0
#[2,] 0 15
#[3,] 0 30
#[4,] 15 45
Say you have these positions:
pos<-c("A0","A1","A2","B3","K12")
You can:
require(data.table) # just to use tstrsplit
res <- setNames(as.data.frame(tstrsplit(pos, "(?<=[A-Z])", perl = TRUE),
                              stringsAsFactors = FALSE),
                c("x", "y"))
res[[1]] <- (match(res[[1]], LETTERS) - 1) * 15
res[[2]] <- as.numeric(res[[2]]) * 15
cbind(pos, res)
# pos x y
#1 A0 0 0
#2 A1 0 15
#3 A2 0 30
#4 B3 15 45
#5 K12 150 180
Here is a dplyr answer
library(dplyr)
library(tidyr)
library(rex)
template = rex(capture(letters),
capture(numbers) )
coordinates = c("A0","A1","B0","B4","K12")
letter_frame =
  data_frame(letter = LETTERS,
             x_small = 0:25)
result =
  data_frame(coordinate = coordinates) %>%
  extract(coordinate, c("letter", "y_small"), template, convert = TRUE) %>%
  left_join(letter_frame) %>%
  mutate(x = x_small * 15,
         y = y_small * 15)
BSconverter <- function(str){
  let <- substr(str, 1, 1)
  num <- as.integer(substr(str, 2, nchar(str))) * 15
  letnum <- (which(LETTERS == let) - 1) * 15
  c(letnum, num)
}
> BSconverter("K12")
[1] 150 180