Implement table() function as a user defined function - r

x <- c(1,2,3,2,1)
table(x)
# x
# 1 2 3
# 2 2 1
Outputs how many times each element occur in the vector.
I am trying to imitate the above function using function()
Below is my code:
TotalTimes = function(x){
times = 0
y = unique(x)
for (i in 1:length(y)) {
for (i in 1:length(x)) {
if(y[i] == x[i])
times = times + 1
}
return(times)
}
}
What would be the right approach?

Here's a one-liner, using rle():
f <- function(x) {
with(rle(sort(x)), setNames(lengths, values))
}
f(c(1,2,3,2,1))
# 1 2 3
# 2 2 1
Alternatively, here's an option that's less "tricky", and is probably a better model for learning to code in an R-ish way:
f2 <- function(x) {
ss <- sort(x)
uu <- unique(ss)
names(uu) <- uu
sapply(uu, function(u) sum(ss == u))
}
f2(c(1,2,3,2,1))
# 1 2 3
# 2 2 1

function(x) {
q = sapply(unique(x), function(i) sum(x == i))
names(q) = unique(x)
return(q)
}

Here is one method using base R:
# data
x <- c(1,2,3,2,1)
# set up
y <- sort(unique(x))
counts <- rep_len(0, length.out=length(y))
names(counts) <- y
for(i in seq_along(x)) {
counts[x[i] == y] <- counts[x[i] == y] + 1
}
Wrapping it in a function:
table2 <- function(x) {
# transform x into character vector to reduce search cost in loop
x <- as.character(x)
y <- sort(unique(x))
counts <- rep_len(0, length.out=length(y))
names(counts) <- y
for(i in seq_along(x)) {
counts[x[i]] <- counts[x[i]] + 1L
}
return(counts)
}
This version only accepts a single vector, of course. At #Frank's suggestion, the function version is slightly different, and possibly faster, in that it transforms the input x into a character. The potential speed up is in the search in counts[x[i]] where the name in counts is referred to (as x[i]), rather than performing a search using "==."

Related

Error in if (a[i][j] > 4) { : missing value where TRUE/FALSE needed

Find the number of entries in each row which are greater than 4.
set.seed(75)
aMat <- matrix( sample(10, size=60, replace=T), nr=6)
rowmax=function(a)
{
x=nrow(a)
y=ncol(a)
i=1
j=1
z=0
while (i<=x) {
for(j in 1:y) {
if(!is.na(a[i][j])){
if(a[i][j]>4){
z=z+1
}
}
j=j+1
}
print(z)
i=i+1
}
}
rowmax(aMat)
It is showing the error. I don't want to apply in built function
You could do this easier counting the x that are greater than 4 using length.
rowmax2 <- function(x) apply(x, 1, function(x) {x <- na.omit(x);length(x[x > 4])})
rowmax2(aMat)
# [1] 8 7 8 7 4 3
If you wanted to do this absolutely without any shortcut you could use two for loops. 1 for each row and another for each value in the row.
rowmax = function(a) {
y=nrow(a)
result <- numeric(y)
for(j in seq_len(y)) {
count = 0
for(val in a[j, ]) {
if(!is.na(val) && val > 4)
count = count + 1
}
result[j] <- count
}
return(result)
}
rowmax(aMat)
#[1] 8 7 8 7 4 3
If you wanted to do this using in-built functions in base R you could use rowSums.
rowSums(aMat > 4, na.rm = TRUE)
#[1] 8 7 8 7 4 3
There are several errors in you code:
You should put z <- 0 inside while loop
You should use a[i,j] for the matrix indexing, rather than a[i][j]
Below is a version after fixing the problems
rowmax <- function(a) {
x <- nrow(a)
y <- ncol(a)
i <- 1
j <- 1
while (i <= x) {
z <- 0
for (j in 1:y) {
if (!is.na(a[i, j])) {
if (a[i, j] > 4) {
z <- z + 1
}
}
j <- j + 1
}
print(z)
i <- i + 1
}
}
and then we get
> rowmax(aMat)
[1] 8
[1] 7
[1] 8
[1] 7
[1] 4
[1] 3
A concise approach to make it is using rowSums, e.g.,
rowSums(aMat, na.rm = TRUE)

Error argument is of length zero occurs when creating my own sorting function

*> csort <- function(c){
i<-1
for (i in 1:length(c)-1) {
j <- i+1
for (j in 2:length(c)) {
if(c[i] >= c[j])c[c(i,j)] <- c[c(j,i)]
j = j + 1
}
i = i + 1
}
}
> csort(a)
Error in if (c[i] >= c[j]) c[c(i, j)] <- c[c(j, i)] :
argument is of length zero*
This is what RStudio do when I run it. I do not know what cause the zero here.
csort <- function(c){
p <- 1
povit <- c[1]
c <- c[-1]
left <- c()
right <- c()
left <- c[which(c <= povit)]
right <- c[which(c > povit)]
if(length(left) > 1){
left <- csort(left)
}
if(length(right) > 1){
right <- csort(right)
}
return(c(left ,povit,right))
}
I viewed more about sorting online and this is a pivot sort way.
your mistake is in this line
for (i in 1:length(c)-1)
and should be
for (i in 1:(length(c)-1))
since $:$ operator precedes $-$.
an example is
1:(5-1)
#[1] 1 2 3 4
1:5-1
#[1] 0 1 2 3 4
so error happen in index with Zero value.
csort <- function(d){
for (i in 1:(length(d)-1)) {
for (j in (i+1):length(d)) {
if(d[i] >= d[j])d[c(i,j)] <- d[c(j,i)]
}
}
return(d)
}
d<-c(5:1,-1:3,-9,-3,10,9,-20,1,20,-6,5)
any((csort(d)==sort(d))==F)
#[1] FALSE
you can improve this function.

Apply Function Across Columns in data.table with do.call and .SD in R

I'm trying to create a variant of pmax / pmin that works with an additional filter_value parameter across an arbitrary set of columns that would be defined using .SD / .SDcols. The first version of the function below hard-codes the filter value, but works with .SD:
testFuncV1 <- function(...) {
cols <- list(...)
num_cols <- length(cols)
num_records <- length(cols[[1]])
max_records <- c()
for (record_num in 1:num_records) {
v <- c()
for (l in cols) {
v <- c(v, l[[record_num]])
}
filt_v <- Filter(function(x) { x <= 1 }, v)
if (length(filt_v) == 0) {
max_records <- c(max_records, NA)
} else {
max_records <- c(max_records, max(filt_v))
}
}
max_records
}
test_dt_v1 <- data.table(a = c(1,3,5), b = c(2,3,-1), c = c(-3, 5, 2))
test_dt_v1[, max_with_filter := do.call(testFuncV1, .SD), .SDcols = c('a', 'b', 'c')]
returns:
a b c max_with_filter
1: 1 2 -3 1
2: 3 3 5 NA
3: 5 -1 2 -1
The second version of the function below takes a second filter parameter, but I was not able to get it to work with .SD, and rather, had to pass the individual column vectors in as a list to get things to work:
testFuncV2 <- function(cols, filter) {
num_cols <- length(cols)
num_records <- length(cols[[1]])
max_records <- c()
for (record_num in 1:num_records) {
v <- c()
for (l in cols) {
v <- c(v, l[[record_num]])
}
filt_v <- Filter(function(x) { x <= filter }, v)
if (length(filt_v) == 0) {
max_records <- c(max_records, NA)
} else {
max_records <- c(max_records, max(filt_v))
}
}
max_records
}
test_dt_v2 <- data.table(a = c(1,3,5), b = c(2,3,-1), c = c(-3, 5, 2))
test_dt_v2[, max_with_filter := do.call(testFuncV2, list(list(test_dt_v2$a, test_dt_v2$b, test_dt_v2$c), 1))]
also returns:
a b c max_with_filter
1: 1 2 -3 1
2: 3 3 5 NA
3: 5 -1 2 -1
Ideally, I'd be able to either figure out an approach that works with .SD using do.call, or substitute in something that works with lapply (which I also experimented around with, to no avail). Thanks in advance!
Here is an option using apply(MARGIN=1, ...)
func <- function(x, threshold) {
if (any(x <= threshold)) return(max(x[x <= threshold]))
NA
}
test_dt_v1[, max_with_filter := apply(.SD, 1, func, threshold=1),
.SDcols=c("a","b","c")]
Another option using do.call and pmax by converting values above 1 to NA first (idea came from rowwise maximum for R)
test_dt_v1[, max_with_filter := do.call(pmax, c(`is.na<-`(.SD, .SD>1), na.rm=T))]

Find non-first match in R

I know that match(x,y) returns the first match of all elements of x in y.
Assuming that x may contain the same value multiple time, I am looking for a concise way to match the nth occurrence in x with the nth occurrence in y.
For example: `
x <- c(3,4,4,3,2,4)
y <- c(1,2,3,4,1,2,3,4)
my.match(x, y)
## 3,4,8,7,2,NA
Using a for loop to match, store and overwrite a match with NA.
idx <- c()
for (i in x) {
k <- match(i, y)
idx <- c(idx, k)
y[k] <- NA
}
idx
#[1] 3 4 8 7 2 NA
The following function is much faster when vectors are large because it does not iterate over the whole vector
my.match <- function(x,y){
fidx <- rep(FALSE,length(x))
fidy <- rep(FALSE,length(y))
ret <- rep(NA,length(x))
repeat{
nidx <- which(!fidx)
nidy <- which(!fidy)
idx <- match(x[nidx],y[nidy])
idy <- match(y[nidy],x[nidx])
ret[nidx] <- nidy[idx]
fidx[nidx[unique(idy)]] <- TRUE
fidy[nidy[unique(idx)]] <- TRUE
if(sum(!is.na(idx))==0 | sum(!is.na(idy))==0){
break
}
}
return(ret)
}
Benchmarking with the other proposed method yields:
my.match1 <- function(x,y){
idx <- c()
for (i in x) {
k <- match(i, y)
idx <- c(idx, k)
y[k] <- NA
}
return(idx)
}
x <- sample.int(100,10000,replace=T)
y <- sample.int(100,10000,replace=T)
system.time(my.match1(x,y))
## user system elapsed
## 1.016 0.003 1.020
system.time(my.match(x,y))
## user system elapsed
## 0.049 0.000 0.049

Count occurrences of condition in lapply

I am running a simulation that I need to keep track of number of occurrences in a function call of a particular condition. I attempted to accomplish this with an assignment to a global object. It works if you run the function but if you try to lapply the function as I'm doing then you get a single count of all the times the condition happened rather than a count for every time it happened for each element in the list fed to lapply.
Here's a dummy situation where the occurrence is evenness of a number:
FUN <- function(x){
lapply(1:length(x), function(i) {
y <- x[i]
if (y %% 2 == 0){
assign("count.occurrences", count.occurrences + 1, env=.GlobalEnv)
}
print("do something")
})
list(guy="x", count=count.occurrences)
}
#works as expected
count.occurrences <- 0
FUN(1:10)
count.occurrences <- 0
lapply(list(1:10, 1:3, 11:16, 9), FUN)
#gives me...
#> count.occurrences
#[1] 9
#I want...
#> count.occurrences
#[1] 5 1 3 0
It's in a simulation so speed is an issue. I want this to be as fast as possible so I'm not married to the global assignment idea.
Rather than assign to the global environment, why not just assign to inside FUN's environment?
FUN <- function(x){
count.occurances <- 0
lapply(1:length(x), function(i) {
y <- x[i]
if (y %% 2 == 0){
count.occurances <<- count.occurances + 1
}
print("do something")
})
list(guy="x", count=count.occurances)
}
Z <- lapply(list(1:10, 1:3, 11:16, 9), FUN)
Then you can just pull the counts out.
> sapply(Z, `[[`, "count")
[1] 5 1 3 0
I haven't done any benchmarking on this, but have you tried just using a for loop? I know that loops aren't generally encouraged in R, but they're also not always slower.
FUN <- function(x) {
count.occurrences = 0
for (i in 1:length(x)) {
y = x[i]
if (y %% 2 == 0) {
count.occurrences = count.occurrences + 1
}
print("do something")
}
list(guy="x", count=count.occurrences)
}
lapply(list(1:10, 1:3, 11:16, 9), FUN)
I can get it like this:
count.occurances <- 0
Z <-lapply(list(1:10, 1:3, 11:16, 9), FUN)
diff(c(0, sapply(1:length(Z), function(x) Z[[x]]$count)))
I'm open to better ideas (faster).

Resources