Subsetting data with a vec - r

I want to subset a data frame by a vector, but replicate the subsetting for each value in the vector:
data = data.frame(A = c(1,2,3,1), B = c(1,2,3,4))
vec = c(1, 1, 1)
subset(data, A %in% vec)
A B
1 1 1
4 1 4
Instead of this result I want this:
A B
1 1 1
4 1 4
1 1 1
4 1 4
1 1 1
4 1 4

If you use the purrr library, you can do
map_df(vec, function(x) subset(data, A == x))
with base R, it would be
do.call("rbind", lapply(vec, function(x) subset(data, A == x)))

You need to expand it, i.e.
df2 <- subset(data, A %in% vec)
df2[rep(rownames(df2), length(vec)),]
# A B
#1 1 1
#4 1 4
#1.1 1 1
#4.1 1 4
#1.2 1 1
#4.2 1 4

One option with data.table:
library(data.table)
setDT(data, key = 'A')[.(vec)]
# A B
#1: 1 1
#2: 1 4
#3: 1 1
#4: 1 4
#5: 1 1
#6: 1 4
Or use merge, which gives cartesian product as you need when there are duplicated values in the merge-by column:
merge(data, data.frame(A = vec))
# A B
#1: 1 1
#2: 1 1
#3: 1 1
#4: 1 4
#5: 1 4
#6: 1 4

Along the lines of a base R split-apply-combine solution would be
do.call(rbind, lapply(vec, function(i) data[data$A == i, ]))
A B
1 1 1
4 1 4
11 1 1
41 1 4
12 1 1
42 1 4
This could be useful if vec contained an uneven mixture of values. This solution could be expensive if there are many repetitions in vec. In that instance, computation can be reduced by combining it with the rep idea in soto's answer as follows.
# count the number of repetitions by unique value
uni <- table(vec)
# extract unique values
temp <- lapply(as.numeric(names(uni)), function(i) data[data$A == i, ])
# combine results, repeating data.frames according to count
do.call(rbind, temp[rep(seq_along(uni), each=uni)])

Related

Summary of values across rows and columns in R

I have a dataset that looks like:
Group A B C D
XYZ 4 Na 1 3
XYZ Na 2 2 1
DEF 4 3 2 1
DEF 3 3 1 1
PQR 1 Na Na 1
PQR 3 2 2 4
I want the summary of this dataset across rows and columns for the count of each value as below:
Group 4 3 2 1
XYZ 1 1 2 2
DEF 1 3 1 3
PQR 1 1 2 2
The count of 4 in the dataset for group XYZ across all rows and columns is 1, for 2 and 1 its 2, for 3 its 1. I can do this by creating 4 new columns 4,3,2,1 and getting the count row wise and then column wise, but this is not efficient and scalable. I am sure there is a better way to get this done.
Using reshape2 package we can melt and dcast as follows,
library(reshape2)
dcast(na.omit(melt(df, id.vars = 'Group')), Group ~ value, fun.aggregate = length)
# Group 1 2 3 4
#1 DEF 3 1 3 1
#2 PQR 2 2 1 1
#3 XYZ 2 2 1 1
This uses no packages and is just one line. Here DF$Group[row(DF[-1])] is a Group labels vector such that each element corresponds to the unravelled numeric vector unlist(DF[-1]).
table(DF$Group[row(DF[-1])], unlist(DF[-1]))
giving:
1 2 3 4
DEF 3 1 3 1
PQR 2 2 1 1
XYZ 2 2 1 1
If the order of rows and columns shown in the question is important then to we can create factors from each of the two table arguments with the factor levels being defined in the orders desired. In this case we use the following line instead of the line of code above:
table(Group = factor(DF$Group[row(DF[-1])], unique(DF$Group)), factor(unlist(DF[-1]), 4:1))
giving:
Group 4 3 2 1
XYZ 1 1 2 2
DEF 1 3 1 3
PQR 1 1 2 2
The above produces an object of class "table". This is a particularly suitable class for tabulated frequencies. For example, once in this form ftable can be used to easily rearrange it further as in ftable(tab, row.vars = 2) or ftable(tab, row.vars = 1:2) where tab is the above computed table.
If a data.frame were preferred then convert it like this:
cbind(Group = rownames(tab), as.data.frame.matrix(tab))
The input data.frame DF is defined reproducibly in Note 2 at the end.
Alternatives
Although the above seems the most direct here are some other alternatives that also use no packages:
1) by For each set of rows having the same Group value the anonymous function creates a data.frame identifying the Group, converting the columns other than the first to a factor with the indicated levels and running table to get the counts. The "by" list that is returned is sorted back to the original order and we rbind everything back together.
do.call("rbind",
by(DF, DF$Group, function(x) {
data.frame(Group = x[1,1],
as.list(table(factor(unlist(x[, -1]), levels = 4:1))),
check.names = FALSE)
})[unique(DF$Group)])
giving:
Group 4 3 2 1
XYZ XYZ 1 1 2 2
DEF DEF 1 3 1 3
PQR PQR 1 1 2 2
1a) This slightly shorter variation would also work. It returns a matrix identifying the groups using row names.
kount <- function(x) table(factor(unlist(x), levels = 4:1))
m <- do.call("rbind", by(DF[, -1], DF$Group, kount)[unique(DF$Group)])
giving:
> m
4 3 2 1
XYZ 1 1 2 2
DEF 1 3 1 3
PQR 1 1 2 2
2) outer
gps <- unique(DF$Group)
levs <- 4:1
kount2 <- function(g, lv) sum(subset(DF, Group == g)[-1] == lv, na.rm = TRUE)
m <- outer(gps, levs, Vectorize(kount2))
dimnames(m) <- list(gps, levs))
giving this matrix:
> m
4 3 2 1
XYZ 1 1 2 2
DEF 1 3 1 3
PQR 1 1 2 2
3) sapply
kount3 <- function(g) table(factor(unlist(DF[DF$Group == g, -1]), levels = 4:1))
gps <- as.character(unique(DF$Group))
do.call("rbind", sapply(gps, kount3, simplify = FALSE))
giving:
4 3 2 1
XYZ 1 1 2 2
DEF 1 3 1 3
PQR 1 1 2 2
4) aggregate
aggregate(1:nrow(DF), DF["Group"], function(ix)
table(factor(unlist(DF[ix, -1]), levels = 4:1)))[unique(DF$Group), ]
giving:
Group x.4 x.3 x.2 x.1
3 XYZ 1 1 2 2
1 DEF 1 3 1 3
2 PQR 1 1 2 2
5) tapply
do.call("rbind", tapply(1:nrow(DF), DF$Group, function(ix)
table(factor(unlist(DF[ix, -1]), levels = 4:1))))[unique(DF$Group), ]
6) reshape
with(reshape(DF, dir = "long", varying = list(2:5)),
table(factor(Group, unique(DF$Group)), factor(A, 4:1)))
giving:
4 3 2 1
XYZ 1 1 2 2
DEF 1 3 1 3
PQR 1 1 2 2
Note 1: (1a), (2), (3), (5) and (6) produce a matrix or table result with groups as row names. If you prefer a data frame with Groups as a column then supposing that m is the matrix, add this:
data.frame(Group = rownames(m), m, check.names = FALSE)
Note 2: The input DF in reproducible form is:
Lines <- "Group A B C D
XYZ 4 Na 1 3
XYZ Na 2 2 1
DEF 4 3 2 1
DEF 3 3 1 1
PQR 1 Na Na 1
PQR 3 2 2 4"
DF <- read.table(text = Lines, header = TRUE, na.strings = "Na")
We can use dplyr/tidyr
library(dplyr)
library(tidyr)
df1 %>%
mutate_each(funs(replace(., .=="Na", NA))) %>%
gather(Var, Val, A:D, na.rm=TRUE) %>%
group_by(Group, Val) %>%
tally() %>%
spread(Val, n)
# Group `1` `2` `3` `4`
#* <chr> <int> <int> <int> <int>
#1 DEF 3 1 3 1
#2 PQR 2 2 1 1
#3 XYZ 2 2 1 1

How can I subset a dataframe according to group membership?

I am wanting to write a function so that a (potentially large) dataframe can be subsetted according to group membership, where a 'group' is a unique combination of a set of column values.
For example, I would like to subset the following data frame according to unique combination of the first two columns (Loc1 and Loc2).
Loc1 <- c("A","A","A","A","B","B","B")
Loc2 <- c("a","a","b","b","a","a","b")
Dat1 <- c(1,1,1,1,1,1,1)
Dat2 <- c(1,2,1,2,1,2,2)
Dat3 <- c(2,2,4,4,6,5,3)
DF=data.frame(Loc1,Loc2,Dat1,Dat2,Dat3)
Loc1 Loc2 Dat1 Dat2 Dat3
1 A a 1 1 2
2 A a 1 2 2
3 A b 1 1 4
4 A b 1 2 4
5 B a 1 1 6
6 B a 1 2 5
7 B b 1 2 3
I want to return (i) the number of groups (i.e. 4), (ii) the number in each group (i.e. c(2,2,2,1), and (iii) to relabel the rows so that I can further analyse the data frame according to group membership (e.g. for ANOVA and MANOVA) (i.e.
Group<-as.factor(c(1,1,2,2,3,3,4))
Data <- cbind(Group,DF[,-1:-2])
Group Dat1 Dat2 Dat3
1 1 1 1 2
2 1 1 2 2
3 2 1 1 4
4 2 1 2 4
5 3 1 1 6
6 3 1 2 5
7 4 1 2 3
).
So far all I have managed is to get the number of groups, and I'm suspicious that there's a better way to do even this:
nrow(unique(DF[,1:2]))
I was hoping to avoid for-loops as I am concerned about the function being slow.
I have tried converting to a data matrix so that I could concatenate the row values but I couldn't get that to work either.
Many thanks
You could try:
Create Group column by using unique level combination of Loc1 and Loc2.
indx <- paste(DF[,1], DF[,2])
DF$Group <- as.numeric(factor(indx, unique(indx))) #query No (iii)
DF1 <- DF[-(1:2)][,c(4,1:3)]
# Group Dat1 Dat2 Dat3
#1 1 1 1 2
#2 1 1 2 2
#3 2 1 1 4
#4 2 1 2 4
#5 3 1 1 6
#6 3 1 2 5
#7 4 1 2 3
table(DF$Group) #(No. ii)
#1 2 3 4
#2 2 2 1
length(unique(DF$Group)) #(i)
#[1] 4
Then, if you need to subset the datasets by group, you could split the dataset using the Group to create a list of 4 list elements
split(DF1, DF1$Group)
Update
If you have multiple columns, you could still try:
ColstoGroup <- 1:2
indx <- apply(DF[,ColstoGroup], 1, paste, collapse="")
as.numeric(factor(indx, unique(indx)))
#[1] 1 1 2 2 3 3 4
You could create a function;
fun1 <- function(dat, GroupCols){
FactGroup <- dat[, GroupCols]
if(length(GroupCols)==1){
dat$Group <- as.numeric(factor(FactGroup, levels=unique(FactGroup)))
}
else {
indx <- apply(FactGroup, 1, paste, collapse="")
dat$Group <- as.numeric(factor(indx, unique(indx)))
}
dat
}
fun1(DF, "Loc1")
fun1(DF, c("Loc1", "Loc2"))
This gets all three of your queries.
Begin with a table of the first two columns and then work with that data.
> (tab <- table(DF$Loc1, DF$Loc2))
#
# a b
# A 2 2
# B 2 1
#
> (ct <- c(tab)) ## (ii)
# [1] 2 2 2 1
> length(unlist(dimnames(tab))) ## (i)
# [1] 4
> cbind(Group = rep(seq_along(ct), ct), DF[-c(1,2)]) ## (iii)
# Group Dat1 Dat2 Dat3
# 1 1 1 1 2
# 2 1 1 2 2
# 3 2 1 1 4
# 4 2 1 2 4
# 5 3 1 1 6
# 6 3 1 2 5
# 7 4 1 2 3
Borrowing a bit from this answer and using some dplyr idioms:
library(dplyr)
Loc1 <- c("A","A","A","A","B","B","B")
Loc2 <- c("a","a","b","b","a","a","b")
Dat1 <- c(1,1,1,1,1,1,1)
Dat2 <- c(1,2,1,2,1,2,2)
Dat3 <- c(2,2,4,4,6,5,3)
DF <- data.frame(Loc1, Loc2, Dat1, Dat2, Dat3)
emitID <- local({
idCounter <- -1L
function(){
idCounter <<- idCounter + 1L
}
})
DF %>% group_by(Loc1, Loc2) %>% mutate(Group=emitID())
## Loc1 Loc2 Dat1 Dat2 Dat3 Group
## 1 A a 1 1 2 0
## 2 A a 1 2 2 0
## 3 A b 1 1 4 1
## 4 A b 1 2 4 1
## 5 B a 1 1 6 2
## 6 B a 1 2 5 2
## 7 B b 1 2 3 3

Remover observations for which there is not a duplicate

I would like to break a dataset into two frames - one for which the original dataset has duplicate observations based on a condition and one for which the original dataset does not have duplicate observations based on a condition. In the following example, I would like to break the frame into one for which there is only one coder for an observation and one for which there are two coders::
frame <- data.frame(id = c(1,1,1,2,2,3), coder = c("A", "A", "B", "A", "B", "A"), y = c(4,5,4,1,1,2))
frame
For this, I would like to produce, such that:
frame1:
id coder y
1 1 A 4
2 1 A 5
3 1 B 4
4 2 A 1
5 2 B 1
frame2:
6 3 A 2
You can use aggregate to determine the ids you want in each data frame:
cts <- aggregate(coder~id, frame, function(x) length(unique(x)))
cts
# id coder
# 1 1 2
# 2 2 2
# 3 3 1
Then you can subset as appropriate based on this:
subset(frame, id %in% cts$id[cts$coder >= 2])
# id coder y
# 1 1 A 4
# 2 1 A 5
# 3 1 B 4
# 4 2 A 1
# 5 2 B 1
subset(frame, id %in% cts$id[cts$coder < 2])
# id coder y
# 6 3 A 2
You may also try:
indx <- !colSums(!table(frame$coder, frame$id))
frame[frame$id %in% names(indx)[indx],]
# id coder y
#1 1 A 4
#2 1 A 5
#3 1 B 4
#4 2 A 1
#5 2 B 1
frame[frame$id %in% names(indx)[!indx],]
# id coder y
#6 3 A 2
Explanation
table(frame$coder, frame$id)
# 1 2 3
# A 2 1 1
# B 1 1 0 #Here for id 3, B==0
If we Negate that, the result would be a logical index
!table(frame$coder, frame$id).
Do the colSums of the above, which results
# 1 2 3
# 0 0 1
Negate again and get the index for ids and subset those ids which are TRUE
From this you can subset by matching with the names of the ids

Create counter with multiple variables [duplicate]

This question already has answers here:
Numbering rows within groups in a data frame
(10 answers)
Closed 6 years ago.
I have my data that looks like below:
CustomerID TripDate
1 1/3/2013
1 1/4/2013
1 1/9/2013
2 2/1/2013
2 2/4/2013
3 1/2/2013
I need to create a counter variable, which will be like below:
CustomerID TripDate TripCounter
1 1/3/2013 1
1 1/4/2013 2
1 1/9/2013 3
2 2/1/2013 1
2 2/4/2013 2
3 1/2/2013 1
Tripcounter will be for each customer.
Use ave. Assuming your data.frame is called "mydf":
mydf$counter <- with(mydf, ave(CustomerID, CustomerID, FUN = seq_along))
mydf
# CustomerID TripDate counter
# 1 1 1/3/2013 1
# 2 1 1/4/2013 2
# 3 1 1/9/2013 3
# 4 2 2/1/2013 1
# 5 2 2/4/2013 2
# 6 3 1/2/2013 1
For what it's worth, I also implemented a version of this approach in a function included in my "splitstackshape" package. The function is called getanID:
mydf <- data.frame(IDA = c("a", "a", "a", "b", "b", "b", "b"),
IDB = c(1, 2, 1, 1, 2, 2, 2), values = 1:7)
mydf
# install.packages("splitstackshape")
library(splitstackshape)
# getanID(mydf, id.vars = c("IDA", "IDB"))
getanID(mydf, id.vars = 1:2)
# IDA IDB values .id
# 1 a 1 1 1
# 2 a 2 2 1
# 3 a 1 3 2
# 4 b 1 4 1
# 5 b 2 5 1
# 6 b 2 6 2
# 7 b 2 7 3
As you can see from the example above, I've written the function in such a way that you can specify one or more columns that should be treated as ID columns. It checks to see if any of the id.vars are duplicated, and if they are, then it generates a new ID variable for you.
You can also use plyr for this (using #AnadaMahto's example data):
> ddply(mydf, .(IDA), transform, .id = seq_along(IDA))
IDA IDB values .id
1 a 1 1 1
2 a 2 2 2
3 a 1 3 3
4 b 1 4 1
5 b 2 5 2
6 b 2 6 3
7 b 2 7 4
or even:
> ddply(mydf, .(IDA, IDB), transform, .id = seq_along(IDA))
IDA IDB values .id
1 a 1 1 1
2 a 1 3 2
3 a 2 2 1
4 b 1 4 1
5 b 2 5 1
6 b 2 6 2
7 b 2 7 3
Note that plyr does not have a reputation for being the quickest solution, for that you need to take a look at data.table.
Here's a data.table approach:
library(data.table)
DT <- data.table(mydf)
DT[, .id := sequence(.N), by = "IDA,IDB"]
DT
# IDA IDB values .id
# 1: a 1 1 1
# 2: a 2 2 1
# 3: a 1 3 2
# 4: b 1 4 1
# 5: b 2 5 1
# 6: b 2 6 2
# 7: b 2 7 3
meanwhile, you can also use dplyr. if your data.frame is called mydata
library(dplyr)
mydata %>% group_by(CustomerID) %>% mutate(TripCounter = row_number())
I need to do this often, and wrote a function that accomplishes it differently than the previous answers. I am not sure which solution is most efficient.
idCounter <- function(x) {
unlist(lapply(rle(x)$lengths, seq_len))
}
mydf$TripCounter <- idCounter(mydf$CustomerID)
Here's the procedure styled code. I dont believe in things like if you are using loop in R then you are probably doing something wrong
x <- dataframe$CustomerID
dataframe$counter <- 0
y <- dataframe$counter
count <- 1
for (i in 1:length(x)) {
ifelse (x[i] == x[i-1], count <- count + 1, count <- 1 )
y[i] <- count
}
dataframe$counter <- y
This isn't the right answer but showing some interesting things comparing to for loops, vectorization is fast does not care about sequential updating.
a<-read.table(textConnection(
"CustomerID TripDate
1 1/3/2013
1 1/4/2013
1 1/9/2013
2 2/1/2013
2 2/4/2013
3 1/2/2013 "), header=TRUE)
a <- a %>%
group_by(CustomerID,TripDate) # must in order
res <- rep(1, nrow(a)) #base # 1
res[2:6] <-sapply(2:6, function(i)if(a$CustomerID[i]== a$CustomerID[i - 1]) {res[i] = res[i-1]+1} else {res[i]= res[i]})
a$TripeCounter <- res

How to assign a counter to a specific subset of a data.frame which is defined by a factor combination?

My question is: I have a data frame with some factor variables. I now want to assign a new vector to this data frame, which creates an index for each subset of those factor variables.
data <-data.frame(fac1=factor(rep(1:2,5)), fac2=sample(letters[1:3],10,rep=T))
Gives me something like:
fac1 fac2
1 1 a
2 2 c
3 1 b
4 2 a
5 1 c
6 2 b
7 1 a
8 2 a
9 1 b
10 2 c
And what I want is a combination counter which counts the occurrence of each factor combination. Like this
fac1 fac2 counter
1 1 a 1
2 2 c 1
3 1 b 1
4 2 a 1
5 1 c 1
6 2 b 1
7 1 a 2
8 2 a 2
9 1 b 2
10 1 a 3
So far I thought about using tapply to get the counter over all factor-combinations, which works fine
counter <-tapply(data$fac1, list(data$fac1,data$fac2), function(x) 1:length(x))
But I do not know how I can assign the counter list (e.g. unlisted) to the combinations in the data-frame without using inefficient looping :)
This is a job for the ave() function:
# Use set.seed for reproducible examples
# when random number generation is involved
set.seed(1)
myDF <- data.frame(fac1 = factor(rep(1:2, 7)),
fac2 = sample(letters[1:3], 14, replace = TRUE),
stringsAsFactors=FALSE)
myDF$counter <- ave(myDF$fac2, myDF$fac1, myDF$fac2, FUN = seq_along)
myDF
# fac1 fac2 counter
# 1 1 a 1
# 2 2 b 1
# 3 1 b 1
# 4 2 c 1
# 5 1 a 2
# 6 2 c 2
# 7 1 c 1
# 8 2 b 2
# 9 1 b 2
# 10 2 a 1
# 11 1 a 3
# 12 2 a 2
# 13 1 c 2
# 14 2 b 3
Note the use of stringsAsFactors=FALSE in the data.frame() step. If you didn't have that, you can still get the output with: myDF$counter <- ave(as.character(myDF$fac2), myDF$fac1, myDF$fac2, FUN = seq_along).
A data.table solution
library(data.table)
DT <- data.table(data)
DT[, counter := seq_len(.N), by = list(fac1, fac2)]
This is a base R way that avoids (explicit) looping.
data$counter <- with(data, {
inter <- as.character(interaction(fac1, fac2))
names(inter) <- seq_along(inter)
inter.ordered <- inter[order(inter)]
counter <- with(rle(inter.ordered), unlist(sapply(lengths, sequence)))
counter[match(names(inter), names(inter.ordered))]
})
Here a variant with a little looping (I have renamed your variable to "x" since "data" is being used otherwise):
x <-data.frame(fac1=rep(1:2,5), fac2=sample(letters[1:3],10,rep=T))
x$fac3 <- paste( x$fac1, x$fac2, sep="" )
x$ctr <- 1
y <- table( x$fac3 )
for( i in 1 : length( rownames( y ) ) )
x$ctr[x$fac3 == rownames(y)[i]] <- 1:length( x$ctr[x$fac3 == rownames(y)[i]] )
x <- x[-3]
No idea whether this is efficient over a large data.frame but it works!

Resources