Counting function in R - r

I have a dataset like this
id <- 1:12
b <- c(0,0,1,2,0,1,1,2,2,0,2,2)
c <- rep(NA,3)
d <- rep(NA,3)
df <-data.frame(id,b)
newdf <- data.frame(c,d)
I want to do some simple counting: for each value of b, if it equals 1 or 2, count it and report how many 1s and 2s there are. However, I don't want to count over the whole dataset; I want my function to count within consecutive blocks of four rows.
I want to a result like this:
> newdf
one two
1 1 1
2 2 1
3 0 3
I tried this with lots of variations, but I couldn't get it to work.
afonk <- function(x) {
ifelse(x==1 | x==2, x, newdf <- (x[1]+x[2]))
}
afonk(newdf$one)
lapply(newdf, afonk)
Thanks in advance!
ismail

Fun with base R:
# counting function
# Count how many elements of `x` are equal to `num`.
# `x`: an atomic vector; `num`: the value to count.
# Returns a single integer. NA elements of `x` are ignored so that one
# missing value cannot turn the whole count into NA.
countnum <- function(x, num) {
  sum(x == num, na.rm = TRUE)
}
# make list of groups of 4
# Rows 1-4 form group 1, rows 5-8 group 2, and so on; a shorter final group
# is kept. seq_len() also copes with a zero-row data frame, where
# 1:nrow(df) would yield c(1, 0).
df$group <- (seq_len(nrow(df)) - 1L) %/% 4L + 1L
dfl <- split(df$b, f = df$group)
# make data frame of counts
# vapply() guarantees exactly one integer per group, whatever the input.
newdf <- data.frame(
  one = vapply(dfl, countnum, integer(1), 1),
  two = vapply(dfl, countnum, integer(1), 2)
)
Edit based on comment:
# make list of groups of 4
df$group <- rep(1:ceiling(nrow(df)/4),each = 4)[1:nrow(df)]
table(subset(df, b != 0L)[c("group", "b")])
Which you prefer depends on what type of result you need. A table will work for a small visual count, and you can likely pull the data out of the table, but if it is as simple as your example, you might opt for the data.frame.

We could use dcast from data.table. Create a grouping variable using %/% and then dcast from 'long' to 'wide' format.
library(data.table)
dcast(setDT(df)[,.N ,.(grp=(id-1)%/%4+1L, b)],
grp~b, value.var='N', fill =0)[,c(2,4), with=FALSE]
Or a slightly more compact version would be using fun.aggregate as length.
res <- dcast(setDT(df)[,list((id-1)%/%4+1L, b)][b!=0],
V1~b, length)[,V1:=NULL][]
res
# 1 2
#1: 1 1
#2: 2 1
#3: 0 3
If we need the column names to be 'one', 'two'
library(english)
names(res) <- as.character(english(as.numeric(names(res))))

Related

Filter by ranges supplied by two vectors, without a join operation

I wish to do exactly this: Take dates from one dataframe and filter data in another dataframe - R
except without joining, as I am afraid that after I join my data the result will be too big to fit in memory, prior to the filter.
Here is sample data:
tmp_df <- data.frame(a = 1:10)
I wish to do an operation that looks like this:
lower_bound <- c(2, 4)
upper_bound <- c(2, 5)
tmp_df %>%
filter(a >= lower_bound & a <= upper_bound) # does not work as <= is vectorised inappropriately
and my desired result is:
> tmp_df[(tmp_df$a <= 2 & tmp_df$a >= 2) | (tmp_df$a <= 5 & tmp_df$a >= 4), , drop = F]
# one way to get indices to subset data frame, impractical for a long range vector
a
2 2
4 4
5 5
My problem with memory requirements (with respect to the join solution linked) is when tmp_df has many more rows and the lower_bound and upper_bound vectors have many more entries. A dplyr solution, or a solution that can be part of pipe is preferred.
Maybe you could borrow the inrange function from data.table, which
checks whether each value in x is in between any of the
intervals provided in lower,upper.
Usage:
inrange(x, lower, upper, incbounds=TRUE)
library(dplyr); library(data.table)
tmp_df %>% filter(inrange(a, c(2,4), c(2,5)))
# a
#1 2
#2 4
#3 5
If you'd like to stick with dplyr it has similar functionality provided through the between function.
# ranges I want to check between
my_ranges <- list(c(2, 2), c(4, 5), c(6, 7))
tmp_df <- data.frame(a = 1:10)
# Build one logical mask per range and OR them together, so a row is kept
# when `a` falls inside any of the ranges. This avoids creating a data
# frame per range and running apply() over the bound result.
tmp_df %>%
  filter(Reduce(
    `|`,
    lapply(my_ranges, function(rng) between(a, rng[1], rng[2]))
  ))
a
1 2
2 4
3 5
4 6
5 7
Just be aware that between always includes the boundaries; unlike inrange (with its incbounds argument), this cannot be changed.

Create combinations of measurements concatenated using underscore

I have a dataframe df1
ID <- c("A","B","C")
Measurement <- c("Length","Height","Breadth")
df1 <- data.frame(ID,Measurement)
I am trying to create combinations of measurements with an underscore between them and put it under the ID column "ALL"
Here is my desired output
ID Measurement
A Length
B Height
C Breadth
ALL Length_Height_Breadth
ALL Length_Breadth_Height
ALL Breadth_Height_Length
ALL Breadth_Length_Height
ALL Height_Length_Breadth
ALL Height_Breadth_Length
Also when there are similar measurements in the "measurement" column, I want to eliminate the underscore.
For example:
ID <- c("A","B")
Measurement <- c("Length","Length")
df2 <- data.frame(ID,Measurement)
Then I would want the desired output to be
ID Measurement
A Length
B Length
ALL Length
I am trying to do something like this which is totally wrong
df1$ID <- paste(df1$Measurement, df1$Measurement, sep="_")
Can someone point me in the right direction to achieving the above outputs?
I would like to see how it is done programmatically instead of using the actual measurement names. I am intending to apply the logic to a larger dataset that has several measurement names and so a general solution would be much appreciated.
We could use the permn function from the combinat package:
library(combinat)
sol_1 <- sapply(permn(unique(df1$Measurement)),
FUN = function(x) paste(x, collapse = '_'))
rbind.data.frame(df1, data.frame('ID' = 'All', 'Measurement' = sol_1))
# ID Measurement
# 1 A Length
# 2 B Height
# 3 C Breadth
# 4 All Length_Height_Breadth
# 5 All Length_Breadth_Height
# 6 All Breadth_Length_Height
# 7 All Breadth_Height_Length
# 8 All Height_Breadth_Length
# 9 All Height_Length_Breadth
sol_2 <- sapply(permn(unique(df2$Measurement)),
FUN = function(x) paste(x, collapse = '_'))
rbind.data.frame(df2, data.frame('ID' = 'All', 'Measurement' = sol_2))
# ID Measurement
# 1 A Length
# 2 B Length
# 3 All Length
Giving credit where credit is due: Generating all distinct permutations of a list.
We could also use permutations from the gtools package (HT #joel.wilson):
library(gtools)
unique_meas <- as.character(unique(df1$Measurement))
apply(permutations(length(unique_meas), length(unique_meas), unique_meas),
1, FUN = function(x) paste(x, collapse = '_'))
# "Breadth_Height_Length" "Breadth_Length_Height"
# "Height_Breadth_Length" "Height_Length_Breadth"
# "Length_Breadth_Height" "Length_Height_Breadth"

How to match Row and Row +1 Using apply R

I am attempting to replace an inefficient nested for loop that will not run on a large dataset with the apply function.
# Lookup table of note ids; the two name columns are filled in by the loop.
unique <- cbind.data.frame(c(1, 2, 3))
colnames(unique) <- "note"
ptSeensub <- rbind.data.frame(c(1, "a"), c(1, "b"), c(2, "a"), c(2, "d"),
                              c(3, "e"), c(3, "f"))
colnames(ptSeensub) <- c("PARENT_EVENT_ID", "USER_NAME")
uniqueRow <- nrow(unique)
ptSeenSubRow <- nrow(ptSeensub)
# Create both result columns up front so that notes without a match stay NA
# instead of inheriting a value recycled from the first assignment.
unique$attending_name <- NA_character_
unique$resident_name <- NA_character_
for (note in seq_len(uniqueRow)) {
  for (row in seq_len(ptSeenSubRow)) {
    if (ptSeensub$PARENT_EVENT_ID[row] == unique$note[note]) {
      # The first matching row holds the attending, the row after it the
      # resident.
      unique$attending_name[note] <- as.character(ptSeensub$USER_NAME[row])
      unique$resident_name[note] <- as.character(ptSeensub$USER_NAME[row + 1])
      # Stop at the first match; later rows of the same event would
      # otherwise overwrite both names.
      break
    }
  }
}
I would like the results to be similar to this dataframe:
results <- rbind.data.frame(c(1, "a", "b"), c(2, "a", "d"), c(3,"e", "f"))
colnames(results) <- c("note", "attending_name", "resident_name")
The loop will be running over millions of rows and will not finish. How can I vectorize this to finish over large data sets? Any advice is greatly appreciated.
Sounds like you are trying to reshape data into wide format. I find that dplyr and tidyr provide nice tools to accomplish this.
define data
library(tidyr)
library(dplyr)
ptSeenSub <- rbind.data.frame(c(1, "a"), c(1, "b"), c(2, "a"), c(2, "d"),
                              c(3, "e"), c(3, "f"))
# Name the columns: the reshaping pipeline that follows refers to
# PARENT_EVENT_ID and USER_NAME.
colnames(ptSeenSub) <- c("PARENT_EVENT_ID", "USER_NAME")
reshape
# Number the rows within each event, then turn those row numbers into
# columns so that each event ends up on a single row. pivot_wider() is the
# current replacement for the superseded spread(); ungroup() keeps the
# result free of leftover grouping.
result <- ptSeenSub %>%
  group_by(PARENT_EVENT_ID) %>%
  mutate(k = row_number()) %>%
  ungroup() %>%
  pivot_wider(names_from = k, values_from = USER_NAME)
You can then change names if you wish:
names(result) <- c("notes", "attending_name", "resident_name")
You could also use dcast from either reshape2 or the devel version of data.table (should be fast) i.e. v1.9.5
library(data.table)
setnames(dcast(setDT(ptSeensub)[, N:= 1:.N, PARENT_EVENT_ID],
PARENT_EVENT_ID~N, value.var='USER_NAME'),
c('note', 'attending_name', 'resident_name'))[]
# note attending_name resident_name
#1: 1 a b
#2: 2 a d
#3: 3 e f
If there are only two observations per each 'PARENT_EVENT_ID'
setDT(ptSeensub)[,.(attending_name=USER_NAME[1L],
resident_name=USER_NAME[2L]) , .(note=PARENT_EVENT_ID)]
# note attending_name resident_name
#1: 1 a b
#2: 2 a d
#3: 3 e f

Matching vector values by records in a data frame in R

I have a vector of values r as follows:
r<-c(1,3,4,6,7)
and a data frame df with 20 records and two columns:
id<-c(1,2,3,4,5,6,7,8,9,10,11,12,13,1,4,15,16,17,18,19,20)
freq<-c(1,3,2,4,5,6,6,7,8,3,3,1,6,9,9,1,1,4,3,7,7)
df<-data.frame(id,freq)
Using the r vector I need to extract a sample of records (in the form of a new data frame) from df in a way that the freq values of the records, would be equal to the values I have in my r vector. Needless to say that if it finds multiple records with the same freq values it should randomly pick one of them. For instance one possible outcome can be:
id frequency
12 1
10 3
4 4
7 6
8 7
I would be thankful if anyone could help me with this.
You could try data.table
library(data.table)
# Pick one id per freq by index: sample(id, 1L) would draw from 1:id
# whenever a freq group holds a single id.
setDT(df)[freq %in% r, id[sample.int(.N, 1L)], freq]
Or using base R
# Pick one id per freq by index: sample(v, 1L) would draw from 1:v
# whenever only a single id matches a freq.
aggregate(id ~ freq, df, subset = freq %in% r,
          FUN = function(v) v[sample.int(length(v), 1L)])
Update
If you have a vector "r" with duplicate values and want to sample from the data set ('df') as many rows for each value as that value occurs in 'r'
r <-c(1,3,3,4,6,7)
# For each distinct value in r, draw as many matching rows (without
# replacement) as that value occurs in r.
res <- do.call(rbind, lapply(split(r, r), function(x) {
  x1 <- df[df$freq %in% x, ]
  # sample.int() fails loudly when fewer rows match than are requested;
  # 1:nrow(x1) would silently become c(1, 0) when nothing matches.
  x1[sample.int(nrow(x1), length(x), replace = FALSE), ]
}))
row.names(res) <- NULL
You can use filter and sample_n from "dplyr":
library(dplyr)
set.seed(1)
df %>%
filter(freq %in% r) %>%
group_by(freq) %>%
sample_n(1)
# Source: local data frame [5 x 2]
# Groups: freq
#
# id freq
# 1 12 1
# 2 10 3
# 3 17 4
# 4 13 6
# 5 8 7
Have you tried using the match() function or %in%? This might not be a fast/clean solution, but uses only base R functions:
# For each distinct target frequency, pick one matching id at random.
rUnique <- unique(r)
df2 <- df[df$freq %in% rUnique, ]
x <- data.frame(id = NA, freq = rUnique)
for (i in seq_along(rUnique)) {
  candidates <- df2$id[df2$freq == rUnique[i]]
  # Choose by index: sample(v, 1) on a length-one numeric v would draw from
  # 1:v instead of returning v. Frequencies with no match keep id = NA.
  if (length(candidates) > 0) {
    x[i, 1] <- candidates[sample.int(length(candidates), 1L)]
  }
}
print(x)

How I can select rows from a dataframe that do not match?

I'm trying to identify the values in a data frame that do not match, but can't figure out how to do this.
# make data frame
a <- data.frame( x = c(1,2,3,4))
b <- data.frame( y = c(1,2,3,4,5,6))
# select only values from b that are not in 'a'
# attempt 1:
results1 <- b$y[ !a$x ]
# attempt 2:
results2 <- b[b$y != a$x,]
If a = c(1,2,3) this works, as the length of b is a multiple of the length of a. However, I'm trying to select all the values from b$y that are not in a$x, and don't understand what function to use.
If I understand correctly, you need the negation of the %in% operator. Something like this should work:
subset(b, !(y %in% a$x))
> subset(b, !(y %in% a$x))
y
5 5
6 6
Try the set difference function setdiff. So you would have
results1 = setdiff(a$x, b$y) # elements in a$x NOT in b$y
results2 = setdiff(b$y, a$x) # elements in b$y NOT in a$x
You could also use dplyr for this task. To find what is in b but not a:
library(dplyr)
anti_join(b, a, by = c("y" = "x"))
# y
# 1 5
# 2 6

Resources