Reordering rows in a dataframe that follow particular sequence - r

I want to reorder rows in the following data frame if the Sequence is 2,1,3. However, this should only apply in instances where the Project ID is the same. So this logic should reorder the rows in Project 123, but should not effect Projects 124 or 125.
Here is the data:
Data <- data.frame(Project=c(123,123,123,124,125,125),
Value=c(1,4,7,3,8,9),
Sequence=c(2,1,3,2,1,3))
This is the result I'm looking for:
Result <- data.frame(Project=c(123,123,123,124,125,125),
Value=c(4,1,7,3,8,9),
Sequence=c(1,2,3,2,1,3))

Not a realy smart piece of work, but it should work - try this...
This function sorts the data rows of the specified sequence, if it appears within a project.(you could also change the sequence)
reorder <- function(data, sequence=c(2,1,3)){
seq_len <- length(sequence)
for (i in 1:(dim(data)[1]-seq_len)){
seq_check <- identical(sequence, data$Sequence[i:(i+seq_len-1)])
if (seq_check) {
pro_check <- identical(rep(data$Project[i], seq_len), data$Project[i:(i+seq_len-1)]) #check if Project is the same over the sequence
if (pro_check){
exchange <- data[i:(i+seq_len-1),]
for (j in 1:seq_len){
data[(i+j-1),] <- exchange[exchange$Sequence==j,]
}
}
}
}
data
}
running:
Data_reordered <- reorder(Data)
results in:
> Data_reordered
Project Value Sequence
1 123 4 1
2 123 1 2
3 123 7 3
4 124 3 2
5 125 8 1
6 125 9 3
and:
> identical(Result, Data_reordered)
[1] TRUE
I hope that is your requested solution :-)

Related

Removing rows from a data frame until a condition is met

I have a function, remove_fun, that removes rows from a data frame based on some conditions (this function is too verbose to include, so here's a simplified example:).
Let's say I have a data frame called block_2, with two columns:
Treatment seq
1 29
1 23
3 60
1 6
2 41
1 5
2 44
For the sake of this example, let's say my function removes 1 row from block_2 at a time based on the highest value of seq in block_2$seq. This function works well when I run it once, i.e. remove_fun(block_2) would return the following output:
Treatment seq
1 29
1 23
1 6
2 41
1 5
2 44
However, what I'm not figuring out is how to repeatedly implement my remove_fun until I reduce block_2 to a certain dimension.
My idea is to do something like this:
while (dim(block_2_df)[1]>1)#The number of rows of block_2_df{
remove_fun(block_2_df)
}
This would theoretically reduce block_2_df until only the observation corresponding to the lowest seq number remains.
However, this doesn't work. I think my problem relates to me not knowing how to use my 'updated' block_2_df iteratively. What I'd like to accomplish is some code that does something like this:
new_df_1<-remove_fun(block_2)
new_df_2<-remove_fun(new_df_1)
new_df_3<-remove_fun(new_df_2)
etc...
I'm not necessarily looking for an exact solution to this problem (as I didn't provide remove_fun), but I'd appreciate some insight re: a general approach to the problem.
Edit: here's my actual code with some example data:
#Start from a block of 10*6 balls, with lambda*(wj) balls of each class
#Allocation ratios
class_1<-"a"
class_2<-"b"
class_3<-"c"
ratio_a<-3
ratio_b<-2
ratio_c<-1
#Min_set
min_set<-c(rep(class_1,ratio_a),rep(class_2,ratio_b),rep(class_3,ratio_c))
min_set_num<-ifelse(min_set=='a',1,ifelse(min_set=='b',2,3))
table_key <- table(min_set_num)
#Number of min_sets
lamb<-10
#Active urn
block_1<-matrix(0,lamb,length(min_set))
for (i in 1:lamb){
block_1[i,]<-min_set
}
#Turn classes into a vector
block_1<-as.vector(block_1)
block_1<-ifelse(block_1=='a',1,ifelse(block_1=='b',2,3))
#Turn into a df w/ identifying numbers:
block_1_df<-data.frame(block_1,seq(1:length(block_1)))
#Enumerate all sampling outcome permutations
library('dplyr')
#Create inactive urn
#Sample from block_1 until min_set is achieved, store in block_2#####
#Random sample :
block_2<-sample(block_1,length(block_1),replace=F)
block_2_df<-block_1_df[sample(nrow(block_1_df), length(block_1)), ]
colnames(block_2_df)<-c('Treatment','seq')
#Generally:####
remove_fun<-function(dat){
#For df
min_set_obs_mat<-matrix(0,length(block_1),2)
min_set_obs_df<-as.data.frame(min_set_obs_mat)
colnames(min_set_obs_df)<-c('Treatment','seq')
for (i in 1:length(block_1)){
if ((sum(min_set_obs_df[,1]==1)<3) || (sum(min_set_obs_df[,1]==2)<2) || (sum(min_set_obs_df[,1]==3)<1)){
min_set_obs_df[i,]<-dat[i,]
}
}
#Get rid of empty rows in df:
min_set_obs_df<-min_set_obs_df%>%filter(Treatment>0)
#Return the sampled 'balls' which satisfy the minimum set into block_2_df (randomized block_!), ####
#keeping the 'extra' balls in a new df: extra_df:####
#Question: does the order of returning matter?####
#Identify min_set
outcome_df<-min_set_obs_df %>% group_by(Treatment) %>% do({
head(., coalesce(table_key[as.character(.$Treatment[1])], 0L))
})
#This removes extra observations 'chronologically'
#Identify extra balls
#Extra_df is the 'inactive' urn####
extra_df<-min_set_obs_df%>%filter(!(min_set_obs_df$seq%in%outcome_df$seq))
#Question: is the number of pts equal to the block size? (lambda*W)?######
#Return min_df back to block_2_df, remove extra_df from block_2_df:
dat<-dat%>%filter(!(seq%in%extra_df$seq))
return(dat)
}
Your while-loop doesn't redefine block2_df. This should work:
while (dim(block_2_df)[1]>1) {
block_2_df <- remove_fun(block_2_df)
}
If all you need is a way to subset the data frame...
df <- data.frame(Treatment = c(1, 1, 3, 1, 2, 1, 2),
seq = c(29, 23, 60, 6, 41, 5, 44))
df
Treatment seq
1 1 29
2 1 23
3 3 60
4 1 6
5 2 41
6 1 5
7 2 44
# Decide how many rows you want in output
n <- 6
# Find the top "n" values in the seq variable
head(sort(df$seq), n)
[1] 5 6 23 29 41 44
# Use them in the subset criteria
df[df$seq %in% head(sort(df$seq), n), ]
Treatment seq
1 1 29
2 1 23
4 1 6
5 2 41
6 1 5
7 2 44

R enumerate duplicates in a dataframe with unique value

I have a dataframe containing a set of parts and test results. The parts are tested on 3 sites (North Centre and South). Sometimes those parts are re-tested. I want to eventually create some charts that compare the results from the first time that a part was tested with the second (or third, etc.) time that it was tested, e.g. to look at tester repeatability.
As an example, I've come up with the below code. I've explicitly removed the "Experiment" column from the morley data set, as this is the column I'm effectively trying to recreate. The code works, however it seems that there must be a more elegant way to approach this problem. Any thoughts?
Edit - I realise that the example given was overly simplistic for my actual needs (I was trying to generate a reproducible example as easily as possible).
New example:
part<-as.factor(c("A","A","A","B","B","B","A","A","A","C","C","C"))
site<-as.factor(c("N","C","S","C","N","S","N","C","S","N","S","C"))
result<-c(17,20,25,51,50,49,43,45,47,52,51,56)
data<-data.frame(part,site,result)
data$index<-1
repeat {
if(!anyDuplicated(data[,c("part","site","index")]))
{ break }
data$index<-ifelse(duplicated(data[,1:2]),data$index+1,data$index)
}
data
part site result index
1 A N 17 1
2 A C 20 1
3 A S 25 1
4 B C 51 1
5 B N 50 1
6 B S 49 1
7 A N 43 2
8 A C 45 2
9 A S 47 2
10 C N 52 1
11 C S 51 1
12 C C 56 1
Old example:
#Generate a trial data frame from the morley dataset
df<-morley[,c(2,3)]
#Set up an iterative variable
#Create the index column and initialise to 1
df$index<-1
# Loop through the dataframe looking for duplicate pairs of
# Runs and Indices and increment the index if it's a duplicate
repeat {
if(!anyDuplicated(df[,c(1,3)]))
{ break }
df$index<-ifelse(duplicated(df[,c(1,3)]),df$index+1,df$index)
}
# Check - The below vector should all be true
df$index==morley$Expt
We may use diff and cumsum on the 'Run' column to get the expected output. In this method, we are not creating a column of 1s i.e 'index' and also assuming that the sequence in 'Run' is ordered as showed in the OP's example.
indx <- cumsum(c(TRUE,diff(df$Run)<0))
identical(indx, morley$Expt)
#[1] TRUE
Or we can use ave
indx2 <- with(df, ave(Run, Run, FUN=seq_along))
identical(indx2, morley$Expt)
#[1] TRUE
Update
Using the new example
with(data, ave(seq_along(part), part, site, FUN=seq_along))
#[1] 1 1 1 1 1 1 2 2 2 1 1 1
Or we can use getanID from library(splitstackshape)
library(splitstackshape)
getanID(data, c('part', 'site'))[]
I think this is a job for make.unique, with some manipulation.
index <- 1L + as.integer(sub("\\d+(\\.)?","",make.unique(as.character(morley$Run))))
index <- ifelse(is.na(index),1L,index)
identical(index,morley$Expt)
[1] TRUE
Details of your actual data.frame may matter. However, a couple of options working with your example:
#this works if each group starts with 1:
df$index<-cumsum(df$Run==1)
#this is maybe more general, with data.table
require(data.table)
dt<-as.data.table(df)
dt[,index:=seq_along(Speed),by=Run]

Eliminating rows from a data.frame

I have this example data.frame:
df <- data.frame(id=c("a","a,b,c","d,e","d","h","e","i","b","c"), start=c(100,100,400,400,800,500,900,200,300), end=c(150,350,550,450,850,550,950,250,350), level = c(1,5,2,3,6,4,2,1,1))
> df
id start end level
1 a 100 150 1
2 a,b,c 100 350 5
3 d,e 400 550 2
4 d 400 450 3
5 h 800 850 6
6 e 500 550 4
7 i 900 950 2
8 b 200 250 1
9 c 300 350 1
where each row is a linear interval.
As this example shows some rows are merged intervals (rows 2 and 3).
What I'd like to do is for each merged interval either eliminate all its individual parts from df if the df$level of the merged interval is greater than that of all its parts, or if the df$level of the merged interval is smaller than at least one of its parts eliminate the merged interval.
So for this example, the output should be:
> res.df
id start end level
1 a,b,c 100 350 5
2 d 400 450 3
3 h 800 850 6
4 e 500 550 4
5 i 900 950 2
Method 1 (ID values)
So If we can assume that all the "merged" group have ID names that are a comma separated list of the individual groups, then we can tackle this problem just looking at the IDs and ignore the start/end information. Here is one such method
First, find all the "merged" groups by finding the IDs with commas
groups<-Filter(function(x) length(x)>1,
setNames(strsplit(as.character(df$id),","),df$id))
Now, for each of those groups, determine who has the larger level, either the merged group or one of the individual groups. Then return the index of the rows to drop as a negative number
drops<-unlist(lapply(names(groups), function(g) {
mi<-which(df$id==g)
ii<-which(df$id %in% groups[[g]])
if(df[mi, "level"] > max(df[ii, "level"])) {
return(-ii)
} else {
return(-mi)
}
}))
And finally, drop those from the data.frame
df[drops,]
# id start end level
# 2 a,b,c 100 350 5
# 4 d 400 450 3
# 5 h 800 850 6
# 6 e 500 550 4
# 7 i 900 950 2
Method 2 (Start/End Graph)
I wanted to also try a method that ignored the (very useful) merged ID names and just looked at the start/end positions. I may have gone off in a bad direction but this lead me to think of it as a network/graph type problem so I used the igraph library.
I created a graph where each vertex represented a start/end position. Each edge therefore represented a range. I used all the ranges from the sample data set and filled in any missing ranges to make the graph connected. I merged that data together to create an edge list. For each edge, I remember the "level" and "id" values from the original data set. Here's the code to do that
library(igraph)
poslist<-sort(unique(c(df$start, df$end)))
seq.el<-embed(rev(poslist),2)
class(seq.el)<-"character"
colnames(seq.el)<-c("start","end")
el<-rbind(df[,c("start","end","level", "id")],data.frame(seq.el, level=0, id=""))
el<-el[!duplicated(el[,1:2]),]
gg<-graph.data.frame(el)
And that creates a graph that looks like
So basically we want to eliminate cycles in the graph by taking the path with the edge that has the maximum "level" value. Unfortunately since this isn't a normal path-weighting scheme, I didn't find an easy way to do this with a default algorithm (maybe I missed it). So I had to write my own graph transversal function. It's not as pretty as I would have liked, but here it is.
findPath <- function(gg, fromv, tov) {
if ((missing(tov) && length(incident(gg, fromv, "in"))>1) ||
(!missing(tov) && V(gg)[fromv]==V(gg)[tov])) {
return (list(level=0, path=numeric()))
}
es <- E(gg)[from(fromv)]
if (length(es)>1) {
pp <- lapply(get.edges(gg, es)[,2], function(v) {
edg <- E(gg)[fromv %--% v]
lvl <- edg$level
nxt <- findPaths(gg,v)
return (list(level=max(lvl, nxt$level), path=c(edg,nxt$path)))
})
lvl <- sapply(pp, `[[`, "level")
take <- pp[[which.max(lvl)]]
nxt <- findPaths(gg, get.edges(gg, tail(take$path,1))[,2], tov)
return (list(level=max(take$level, nxt$level), path=c(take$path, nxt$path)))
} else {
lvl <- E(gg)[es]$level
nv <- get.edges(gg,es)[,2]
nxt <- findPaths(gg, nv, tov)
return (list(level=max(lvl, nxt$level), path=c(es, nxt$path)))
}
}
This will find a path between two nodes that satisfies the property of having a maximal level when presented with a branch. We call that with this data set with
rr <- findPaths(gg, "100","950")$path
This will find the final path. Since each row in the original df data.frame is represented by an edge, we just need to extract the edges from the path that correspond to the final path. This actually gives us a path that looks like
where the red path is the chosen one. I can then subset df with
df[df$id %in% na.omit(E(gg)[rr]$id), ]
# id start end level
# 2 a,b,c 100 350 5
# 4 d 400 450 3
# 5 h 800 850 6
# 6 e 500 550 4
# 7 i 900 950 2
Method 3 (Overlap Matrix)
He's another way to look at the start/stop positions. I create a matix where columns correspond to ranges in the rows of the data.frame and the rows of the matrix correspond to positions. Each value in the matrix is true if a range overlaps a position. Here I use the between.R helper function
#find unique positions and create overlap matrix
un<-sort(unique(unlist(df[,2:3])))
cc<-sapply(1:nrow(df), function(i) between(un, df$start[i], df$end[i]))
#partition into non-overlapping sections
groups<-cumsum(c(F,rowSums(cc[-1,]& cc[-nrow(cc),])==0))
#find the IDs to keep from each section
keeps<-lapply(split.data.frame(cc, groups), function(m) {
lengths <- colSums(m)
mx <- which.max(lengths)
gx <- setdiff(which(lengths>0), mx)
if(length(gx)>0) {
if(df$level[mx] > max(df$level[gx])) {
mx
} else {
gx
}
} else {
mx
}
})
This will give a list of the IDs to keep from each group, and we can get the final data.set with
df[unlist(keeps),]
Method 4 (Open/Close Listing)
I have one last method. This one might be the most scalable. We basically melt the positions and keep track of opening and closing events to identify the groups. Then we split and see if the longest in each group has the max level or not. Ultimately we return the IDs. This method uses all standard base functions.
#create open/close listing
dd<-rbind(
cbind(df[,c(1,4)],pos=df[,2], evt=1),
cbind(df[,c(1,4)],pos=df[,3], evt=-1)
)
#annotate with useful info
dd<-dd[order(dd$pos, -dd$evt),]
dd$open <- cumsum(dd$evt)
dd$group <- cumsum(c(0,head(dd$open,-1)==0))
dd$width <- ave(dd$pos, dd$id, FUN=function(x) diff(range(x)))
#slim down
dd <- subset(dd, evt==1,select=c("id","level","width","group"))
#process each group
ids<-unlist(lapply(split(dd, dd$group), function(x) {
if(nrow(x)==1) return(x$id)
mw<-which.max(x$width)
ml<-which.max(x$level)
if(mw==ml) {
return(x$id[mw])
} else {
return(x$id[-mw])
}
}))
and finally subset
df[df$id %in% ids, ]
by now I think you know what this returns
Summary
So if your real data has the same type of IDs as the sample data, obviously method 1 is a better, more direct choice. I'm still hoping there is a way to simplify method 2 that i'm just missing. I've not done any testing on efficiency or performance of these methods. I'm guessing method 4 might be be the most efficient since it should scale linearly.
I'll take a procedural approach; basically, sort descending by level,
and for each record, remove later records that have a matching id.
df <- data.frame(id=c("a","a,b,c","d,e","d","h","e","i","b","c"), start=c(100,100,400,400,800,500,900,200,300), end=c(150,350,550,450,850,550,950,250,350),
level = c(1,5,2,3,6,4,2,1,1), stringsAsFactors=FALSE)
#sort
ids <- df[order(df$level, decreasing=TRUE), "id"]
#split
ids <- sapply(df$id, strsplit, ",")
i <- 1
while( i < length(ids)) {
current <- ids[[i]]
j <- i + 1
while(j <= length(ids)) {
if(any(ids[[j]] %in% current))
ids[[j]] <- NULL
else
j <- j + 1
}
i <- i + 1
}
And finally, only keep the ids that are left:
R> ids <- data.frame(id=names(ids), stringsAsFactors=FALSE)
R> merge(ids, df, sort=FALSE)
id start end level
1 h 800 850 6
2 a,b,c 100 350 5
3 e 500 550 4
4 d 400 450 3
5 i 900 950 2
This has ugly while loops because R only has for-each loops, and also note the stringsAsFactors=FALSE is necessary for splitting the ids. Deleting middle elements
could be bad for performance, but that will depend on the underlying implementation
R uses for lists (linked vs arrays).

Comparing two columns: logical- is value from column 1 also in column 2?

I'm pretty confused on how to go about this. Say I have two columns in a dataframe. One column a numerical series in order (x), the other specifying some value from the first, or -1 (y). These are results from a matching experiment, where the goal is to see if multiple photos are taken of the same individual. In the example below, there 10 photos, but 6 are unique individuals. In the y column, the corresponding x is reported if there is a match. y is -1 for no match (might as well be NAs). If there is more than 2 photos per individual, the match # will be the most recent record (photo 1, 5 and 7 are the same individual below). The group is the time period the photo was take (no matches within a group!). Hopefully I've got this example right:
x <- c(1,2,3,4,5,6,7,8,9,10)
y <- c(-1,-1,-1,-1,1,-1,1,-1,2,4)
group <- c(1,1,1,2,2,2,3,3,3,3)
DF <- data.frame(x,y,group)
I would like to create a new variable to name the unique individuals, and have a final dataset with a single row per individual (i.e. only have 6 rows instead of 10), that also includes the group information. I.e. if an individual is in all three groups, there could be a value of "111" or if just in the first and last group it would be "101". Any tips?
Thanks for asking about the resulting dataset. I realized my group explanation was bad based on the actual numbers I gave, so I changed the results slightly. Bonus would also be nice to have, but not critical.
name <- c(1,2,3,4,6,8)
group_history <- as.character(c('111','101','100','011','010','001'))
bonus <- as.character(c('1,5,7','2,9','3','4,10','6','8'))
results_I_want <- data.frame(name,group_history,bonus)
My word, more mistakes fixed above...
Using the (updated) example you gave
x <- c(1,2,3,4,5,6,7,8,9,10)
y <- c(-1,-1,-1,-1,1,-1,1,-1,3,4)
group <- c(1,1,1,2,2,2,3,3,3,3)
DF <- data.frame(x,y,group)
Use the x and y to create a mapping from higher numbers to lower numbers that are the same person. Note that names is a string, despite it be a string of digits.
bottom.df <- DF[DF$y==-1,]
mapdown.df <- DF[DF$y!=-1,]
mapdown <- c(mapdown.df$y, bottom.df$x)
names(mapdown) <- c(mapdown.df$x, bottom.df$x)
We don't know how many times it might take to get everything down to the lowest number, so have to use a while loop.
oldx <- DF$x
newx <- mapdown[as.character(oldx)]
while(any(oldx != newx)) {
oldx = newx
newx = mapdown[as.character(oldx)]
}
The result is the group it belongs to, names by the lowest number of that set.
DF$id <- unname(newx)
Getting the group membership is harder. Using reshape2 to convert this into wide format (one column per group) where the column is "1" if there was something in that one and "0" if not.
library("reshape2")
wide <- dcast(DF, id~group, value.var="id",
fun.aggregate=function(x){if(length(x)>0){"1"}else{"0"}})
Finally, paste these "0"/"1" memberships together to get the grouping variable you described.
wide$grouping = apply(wide[,-1], 1, paste, collapse="")
The result:
> wide
id 1 2 3 grouping
1 1 1 1 1 111
2 2 1 0 0 100
3 3 1 0 1 101
4 4 0 1 1 011
5 6 0 1 0 010
6 8 0 0 1 001
No "bonus" yet.
EDIT:
To get the bonus information, it helps to redo the mapping to keep everything. If you have a lot of cases, this could be slow.
Replace the oldx/newx part with:
iterx <- matrix(DF$x, ncol=1)
iterx <- cbind(iterx, mapdown[as.character(iterx[,1])])
while(any(iterx[,ncol(iterx)]!=iterx[,ncol(iterx)-1])) {
iterx <- cbind(iterx, mapdown[as.character(iterx[,ncol(iterx)])])
}
DF$id <- iterx[,ncol(iterx)]
To generate the bonus data, then you can use
bonus <- tapply(iterx[,1], iterx[,ncol(iterx)], paste, collapse=",")
wide$bonus <- bonus[as.character(wide$id)]
Which gives:
> wide
id 1 2 3 grouping bonus
1 1 1 1 1 111 1,5,7
2 2 1 0 0 100 2
3 3 1 0 1 101 3,9
4 4 0 1 1 011 4,10
5 6 0 1 0 010 6
6 8 0 0 1 001 8
Note this isn't same as your example output, but I don't think your example output is right (how can you have a grouping_history of "000"?)
EDIT:
Now it agrees.
Another solution for bonus variable
f_bonus <- function(data=df){
data_a <- subset(data,y== -1,select=x)
data_a$pos <- seq(nrow(data_a))
data_b <- subset(df,y!= -1,select=c(x,y))
data_b$pos <- match(data_b$y, data_a$x)
data_t <- rbind(data_a,data_b[-2])
data_t <- with(data_t,tapply(x,pos,paste,sep="",collapse=","))
return(data_t)
}

How to assign number of repeats to dataframe based on elements of an identifying vector in R?

I have a dataframe with individuals assigned a text id that concatenates a place-name with a personal id (see data, below). Ultimately, I need to do a transformation of the data set from "long" to "wide" (e.g., using "reshape") so that each individual comprises one row, only. In order to do that, I need to assign a "time" variable that reshape can use to identify time-varying covariates, etc. I have (probably bad) code to do this for individuals that repeat up to two times, but need to be able to identify up to 18 repeated occurrences. The code below works fine if I remove the line preceded by the hash, but only identifies up to two repeats. If I leave that line in (which would seem necessary for individuals repeated more than twice), R chokes, giving the following error (presumably because the first individual is repeated only twice):
Error in if (data$uid[i] == data$uid[i - 2]) { :
argument is of length zero
Can anyone help with this? Thanks in advance!
place <- rep("ny",10)
pid <- c(1,1,2,2,2,3,4,4,5,5)
uid<- paste(place,pid,sep="")
time <- rep(0,10)
data <- cbind(uid,time)
data <- as.data.frame(data)
data$time <- as.numeric(data$time)
#bad code
data$time[1] <- 1 #need to set first so that loop doesn't go to a row that doesn't exist (i.e., row 0)
for (i in 2:NROW(data)){
data$time[i] <- 1 #set first occurrence to 1
if (data$uid[i] == data$uid[i-1]) {data$time[i] <- 2} #set second occurrence to 2, etc.
#if (data$uid[i] == data$uid[i-2]) {data$time[i] <- 3}
i <- i+1
}
It's unclear what you are trying to do, but I think you're saying that you need to create a time index for each row by every unique uid. Is that right?
If so, give this a whirl
library(plyr)
ddply(data, "uid", transform, time = seq_along(uid))
Will give you something like:
uid time
1 ny1 1
2 ny1 2
3 ny2 1
4 ny2 2
5 ny2 3
....
Is this what you have in mind?
> d <- data.frame(uid = paste("ny",c(1,2,1,2,2,3,4,4,5,5),sep=""))
> out <- do.call(rbind, lapply(split(d, d$uid), function(x) {x$time <- 1:nrow(x); x}))
> rownames(out) <- NULL
> out
uid time
1 ny1 1
2 ny1 2
3 ny2 1
4 ny2 2
5 ny2 3
6 ny3 1
7 ny4 1
8 ny4 2
9 ny5 1
10 ny5 2
Using your data frame setup:
place <- rep("ny",10)
pid <- c(1,1,2,2,2,3,4,4,5,5)
uid<- paste(place,pid,sep="")
time <- rep(0,10)
data <- cbind(uid,time)
data <- as.data.frame(data)
You can use:
data$time <- sequence(table(data$uid))
data
To get:
> data
uid time
1 ny1 1
2 ny1 2
3 ny2 1
4 ny2 2
5 ny2 3
6 ny3 1
7 ny4 1
8 ny4 2
9 ny5 1
10 ny5 2
NOTE: Your data.frame MUST be sorted by uid first for this to work.
After trying the above solutions on large data sets, I decided to write my own loop for this. It was very time-consuming and still required the data to be broken into 50k-element vectors, but it did work in the end:
system.time( for(i in 2:length(data$uid)) {
if(data$uid[i]==data$uid[i-1]) data$repeats[i] <- data$repeats[i-1]+1
if ((i %% 1000)== 0) { #helps to keep track of how far the loop has gotten
print(i) }
i+1
}
)
Thanks to all for your help.

Resources