I have the fmatch function working in a loop but I was wondering if it's possible to apply this functionality to the vector all once rather than looping through.
Here is the code running through the loop, which currently works.
library(readxl)
library(data.table)
library(plyr)
library(tidyr)
library(dplyr)
library(tibble)
library(fastmatch)
library(stringr)
library(magrittr)
library(RcppBDT)
##library(anytime)
## Load time zone data sheet
TZData <- read_excel("TZDataFile.xlsx")
TZData <- as.data.table(TZData)
TZRange <- TZData[,1]
TZRange <- as.data.frame(TZRange)
##Bring in test data
TD <- read_excel("Test dates.xlsx", col_types = c("text", "text"))
TD <- as.data.table(TD)
####Start Time Conversion Code####
## Define variables
Station <- TD[,1] ##Station
GMT <- TD[,2] ##Date/time stamp in GMT to be converted to local
z <- nrow(TD)+0
APLDateTime <- data.frame(RawLocal = double(), RawLocalDateTime = as.Date(character()))
for (i in 1:z) {
STA <- as.character(Station[i,1]) ## Get Station
APCode <- as.integer(fmatch(STA, TZRange[,1])) ## Match station on Time Zone Data sheet
When I try to just run
STA <- as.character(Station[,1]) ## Get Station
APCode <- as.integer(fmatch(STA, TZRange[,1])) ## Match station on Time Zone Data sheet
I get NA_integer_ for APCode.
Sample Data:
> STA
[1] "c(\"LHR\", \"PHL\", \"DFW\", \"PHX\", \"LAX\", \"BCN\")"
> head(TZRange,10)
Code
1 369
2 04G
3 06A
4 06C
5 06N
6 09J
7 0A9
8 0G6
9 0G7
10 0P2
1183 DFW
2748 LHR
3809 PHL
I am looking for a result like
APCode = c(2748, 3809, 1183, etc.)
Thanks for the help.
First I assume you want STA to be a character vector. As I do not have your full data I will use the one provided and convert it
STA<-"c(\"LHR\", \"PHL\", \"DFW\", \"PHX\", \"LAX\", \"BCN\")"%>%substring(2)%>%str_replace_all("[[:punct:]]","")
> STA
[1] "LHR PHL DFW PHX LAX BCN"
Let me put an extra value so it finds a match to TZRange:
STA=c(STA,"04G")
> STA
[1] "LHR PHL DFW PHX LAX BCN" "04G"
For TZRange I have kept the 10 first values your provided
> TZRange
code
1 369
2 04G
3 06A
4 06C
5 06N
6 09J
7 0A9
8 0G6
9 0G7
10 0P2
Now you can specify the index of TZRange where matches with STA are found
APCode <- na.omit(fmatch(STA,TZRange[,1]))[1]
> APCode
[1] 2
Hope this helps
I got it working! I had to set both STA and TZRange as.data.frame then run the code APCode <- fmatch(STA[,1], TZRange[,1]) and it worked perfectly. I did first try the na.omit Antonis suggested, but it did not give me the list of indexes like I was looking for. Thanks for the assistance.
Related
With some effort and help from the stackers, I have been able to parse a webpage and save it as a dataframe. I want to repeat the same operation on multiple xml files and rbind the list. Here is what I tried and did successfully:
library(XML)
xml.url <- "http://www.ebi.ac.uk/ena/data/view/ERS445758&display=xml"
doc <- xmlParse(xml.url)
x <- xmlToDataFrame(getNodeSet(doc,"//SAMPLE_ATTRIBUTE"))
x$UNITS <- NULL
x_t <- t(x)
x_t <- as.data.frame(x_t)
names(x_t) <- as.matrix(x_t[1, ])
x_t <- x_t[-1, ]
x_t[] <- lapply(x_t, function(x) type.convert(as.character(x)))
Above code works well, now when I try to apply a function to do the same for multiple xml files :
ERS_ID <- c("ERS445758","ERS445759", "ERS445760", "ERS445761", "ERS445762")
xml_url_test = as.vector(sprintf("http://www.ebi.ac.uk/ena/data/view/ERS445758&display=xml",
ERS_ID))
XML_parser <- function(XML_url){
doc <- xmlParse(XML_url)
x <- xmlToDataFrame(getNodeSet(doc,"//SAMPLE_ATTRIBUTE"))
x$UNITS <- NULL
x_t <- t(x)
x_t <- as.data.frame(x_t)
names(x_t) <- as.matrix(x_t[1, ])
x_t <- x_t[-1, ]
x_t[] <- lapply(x_t, function(x) type.convert(as.character(x)))
return(x_t)
}
major_test <- sapply(xml_url_test, XML_parser)
It works, but gives me a long list that is not in the right data frame format as I generated for the single XML file.
Finally I would like to also add a column to the final dataframe that has the ERS number from the ERS_ID vector
Something like x_t$ERSid <- ERS_ID in the function
Can someone point out what am I missing in the function as well as any better ways to do the task?
Thanks!
Your main issue is using sapply over lapply() where the latter returns a list and former attempts to simplify to a vector or matrix, here being a matrix.
major_test <- lapply(xml_url_test, XML_parser)
Of course, sapply is a wrapper for lapply and can also return a list: sapply(..., simplify=FALSE):
major_test <- sapply(xml_url_test, XML_parser, simplify=FALSE)
However, a few other items came up:
At beginning, you are not concatenating your ERS_ID to the url stem with sprintf's %s operator. So right now, the same urls are repeating.
At end, you are not binding your list of data frames into a compiled final single dataframe.
Add new ERS column inside your defined function, passing in ERS_ID vector. And while creating column, also remove the ERS prefix with gsub.
R code (adjusted)
XML_parser <- function(eid) {
XML_url <- as.vector(sprintf("http://www.ebi.ac.uk/ena/data/view/%s&display=xml", eid))
doc <- xmlParse(XML_url)
x <- xmlToDataFrame(getNodeSet(doc,"//SAMPLE_ATTRIBUTE"))
x$UNITS <- NULL
x_t <- t(x)
x_t <- as.data.frame(x_t)
names(x_t) <- as.matrix(x_t[1, ])
x_t <- x_t[-1, ]
x_t[] <- lapply(x_t, function(x) type.convert(as.character(x)))
x_t$ERSid <- gsub("ERS", "", eid) # ADD COL, REMOVE ERS
x_t <- x_t[,c(ncol(x_t),2:ncol(x_t)-1)] # MOVE NEW COL TO FIRST
return(x_t)
}
major_test <- lapply(ERS_ID, XML_parser)
# major_test <- sapply(ERS_ID, XML_parser, simplify=FALSE)
# BIND DATA FRAMES TOGETHER
finaldf <- do.call(rbind, major_test)
# RESET ROW NAMES
row.names(finaldf) <- seq(nrow(finaldf))
Using xml2 and the tidyverse you can do something like this:
require(xml2)
require(purrr)
require(tidyr)
urls <- rep("http://www.ebi.ac.uk/ena/data/view/ERS445758&display=xml", 2)
identifier <- LETTERS[seq_along(urls)] # Take a unique identifier per url here
parse_attribute <- function(x){
out <- data.frame(tag = xml_text(xml_find_all(x, "./TAG")),
value = xml_text(xml_find_all(x, "./VALUE")), stringsAsFactors = FALSE)
spread(out, tag, value)
}
doc <- map(urls, read_xml)
out <- doc %>%
map(xml_find_all, "//SAMPLE_ATTRIBUTE") %>%
set_names(identifier) %>%
map_df(parse_attribute, .id="url")
Which gives you a 2x36 data.frame. To parse the column type i would suggest using readr::type_convert(out)
Out looks as follows:
url age body product body site body-mass index chimera check collection date
1 A 28 mucosa Sigmoid colon 16.95502 ChimeraSlayer; Usearch 4.1 database 2009-03-16
2 B 28 mucosa Sigmoid colon 16.95502 ChimeraSlayer; Usearch 4.1 database 2009-03-16
disease status ENA-BASE-COUNT ENA-CHECKLIST ENA-FIRST-PUBLIC ENA-LAST-UPDATE ENA-SPOT-COUNT
1 remission 627051 ERC000015 2014-12-31 2016-10-21 1668
2 remission 627051 ERC000015 2014-12-31 2016-10-21 1668
environment (biome) environment (feature) environment (material) experimental factor
1 organism-associated habitat organism-associated habitat mucus microbiome
2 organism-associated habitat organism-associated habitat mucus microbiome
gastrointestinal tract disorder geographic location (country and/or sea,region) geographic location (latitude)
1 Ulcerative Colitis India 72.82807
2 Ulcerative Colitis India 72.82807
geographic location (longitude) host subject id human gut environmental package investigation type
1 18.94084 1 human-gut metagenome
2 18.94084 1 human-gut metagenome
medication multiplex identifiers pcr primers phenotype project name
1 ASA;Steroids;Probiotics;Antibiotics TGATACGTCT 27F-338R pathological BMRP
2 ASA;Steroids;Probiotics;Antibiotics TGATACGTCT 27F-338R pathological BMRP
sample collection device or method sequence quality check sequencing method sequencing template sex target gene
1 biopsy software pyrosequencing DNA male 16S rRNA
2 biopsy software pyrosequencing DNA male 16S rRNA
target subfragment
1 V1V2
2 V1V2
purrr is really helpful here, as you can iterate over a vector of URLs or a list of XML files with map, or within nested elements with at_depth, and simplify the results with the *_df forms and flatten.
library(tidyverse)
library(xml2)
# be kind, don't call this more times than you need to
x <- c("ERS445758","ERS445759", "ERS445760", "ERS445761", "ERS445762") %>%
sprintf("http://www.ebi.ac.uk/ena/data/view/%s&display=xml", .) %>%
map(read_xml) # read each URL into a list item
df <- x %>% map(xml_find_all, '//SAMPLE_ATTRIBUTE') %>% # for each item select nodes
at_depth(2, as_list) %>% # convert each (nested) attribute to list
map_df(map_df, flatten) # flatten items, collect pages to df, then all to one df
df
## # A tibble: 175 × 3
## TAG VALUE UNITS
## <chr> <chr> <chr>
## 1 investigation type metagenome <NA>
## 2 project name BMRP <NA>
## 3 experimental factor microbiome <NA>
## 4 target gene 16S rRNA <NA>
## 5 target subfragment V1V2 <NA>
## 6 pcr primers 27F-338R <NA>
## 7 multiplex identifiers TGATACGTCT <NA>
## 8 sequencing method pyrosequencing <NA>
## 9 sequence quality check software <NA>
## 10 chimera check ChimeraSlayer; Usearch 4.1 database <NA>
## # ... with 165 more rows
You can retrieve multiple IDs with a single REST url using a comma-separated list or range like ERS445758-ERS445762 and avoid multiple queries to the ENA.
This code gets all 5 samples into a node set and then applies functions using a leading dot in the xpath string so its relative to that node.
ERS_ID <- c("ERS445758","ERS445759", "ERS445760", "ERS445761", "ERS445762")
url <- paste0( "http://www.ebi.ac.uk/ena/data/view/", paste(ERS_ID, collapse=","), "&display=xml")
doc <- xmlParse(url)
samples <- getNodeSet( doc, "//SAMPLE")
## check the first node
samples[[1]]
## get the sample attribute node set and apply xmlToDataFrame to that
x <- lapply( lapply(samples, getNodeSet, ".//SAMPLE_ATTRIBUTE"), xmlToDataFrame)
# labels for bind_rows
names(x) <- sapply(samples, xpathSApply, ".//PRIMARY_ID", xmlValue)
library(dplyr)
y <- bind_rows(x, .id="sample")
z <- subset(y, TAG %in% c("age","sex","body site","body-mass index") , 1:3)
sample TAG VALUE
15 ERS445758 age 28
16 ERS445758 sex male
17 ERS445758 body site Sigmoid colon
19 ERS445758 body-mass index 16.9550173
50 ERS445759 age 58
51 ERS445759 sex male
...
library(tidyr)
z %>% spread( TAG, VALUE)
sample age body site body-mass index sex
1 ERS445758 28 Sigmoid colon 16.9550173 male
2 ERS445759 58 Sigmoid colon 23.22543185 male
3 ERS445760 26 Sigmoid colon 20.76124567 female
4 ERS445761 30 Sigmoid colon 0 male
5 ERS445762 36 Sigmoid colon 0 male
I am attempting to repeatedly add a "fixed number" to a numeric vector depending on a specified bin size. However, the "fixed number" is dependent on the data range.
For instance ; i have a data range 10 to 1010, and I wish to separate the data into 100 bins. Therefore ideally the data would look like this
Since 1010 - 10 = 1000
And 1000 / 100(The number of bin specified) = 10
Therefore the ideal data would look like this
bin1 - 10 (initial data)
bin2 - 20 (initial data + 10)
bin3 - 30 (initial data + 20)
bin4 - 40 (initial data + 30)
bin100 - 1010 (initial data + 1000)
Now the real data is slightly more complex, there is not just one data range but multiple data range, hopefully the example below would clarify
# Some fixed values
start <- c(10, 5000, 4857694)
end <- c(1010, 6500, 4897909)
Ideally I wish to get something like
10 20
20 30
30 40
.. ..
5000 5015
5015 5030
5030 5045
.. ..
4857694 4858096 # Note theoretically it would have decimal places,
#but i do not want any decimal place
4858096 4858498
.. ..
So far I was thinking along this kind of function, but it seems inefficient because ;
1) I have to retype the function 100 times (because my number of bin is 100)
2) I can't find a way to repeat the function along my values - In other words my function can only deal with the data 10-1010 and not the next one 5000-6500
# The range of the variable
width <- end - start
# The bin size (Number of required bin)
bin_size <- 100
bin_count <- width/bin_size
# Create a function
f1 <- function(x,y){
c(x[1],
x[1] + y[1],
x[1] + y[1]*2,
x[1] + y[1]*3)
}
f1(x= start,y=bin_count)
f1
[1] 10 20 30 40
Perhaps any hint or ideas would be greatly appreciated. Thanks in advance!
Aafter a few hours trying, managed to answer my own question, so I thought to share it. I used the package "binr" and the function in the package called "bins" to get the required bin. Please find below my attempt to answer my question, its slightly different than the intended output but for my purpose it still is okay
library(binr)
# Some fixed values
start <- c(10, 5000, 4857694)
end <- c(1010, 6500, 4897909)
tmp_list_start <- list() # Create an empty list
# This just extract the output from "bins" function into a list
for (i in seq_along(start)){
tmp <- bins(start[i]:end[i],target.bins = 100,max.breaks = 100)
# Now i need to convert one of the output from bins into numeric value
s <- gsub(",.*", "", names(tmp$binct))
s <- gsub("\\[","",s)
tmp_list_start[[i]] <- as.numeric(s)
}
# Repeating the same thing with slight modification to get the end value of the bin
tmp_list_end <- list()
for (i in seq_along(end)){
tmp <- bins(start[i]:end[i],target.bins = 100,max.breaks = 100)
e <- gsub(".*,", "", names(tmp$binct))
e <- gsub("]","",e)
tmp_list_end[[i]] <- as.numeric(e)
}
v1 <- unlist(tmp_list_start)
v2 <- unlist(tmp_list_end)
df <- data.frame(start=v1, end=v2)
head(df)
start end
1 10 20
2 21 30
3 31 40
4 41 50
5 51 60
6 61 70
Pardon my crappy code, Please share if there is a better way of doing this. Would be nice if someone could comment on how to wrap this into a function..
Here's a way that may help with base R:
bin_it <- function(START, END, BINS) {
range <- END-START
jump <- range/BINS
v1 <- c(START, seq(START+jump+1, END, jump))
v2 <- seq(START+jump-1, END, jump)+1
data.frame(v1, v2)
}
It uses the function seq to create the vectors of numbers leading to the ending number. It may not work for every case, but for the ranges you gave it should give the desired output.
bin_it(10, 1010)
v1 v2
1 10 20
2 21 30
3 31 40
4 41 50
5 51 60
bin_it(5000, 6500)
v1 v2
1 5000 5015
2 5016 5030
3 5031 5045
4 5046 5060
5 5061 5075
bin_it(4857694, 4897909)
v1 v2
1 4857694 4858096
2 4858097 4858498
3 4858499 4858900
4 4858901 4859303
5 4859304 4859705
6 4859706 4860107
I've been trying for quite some time to get my test data to split.
> FDF <- read.csv.ffdf(file='C:\\Users\\William\\Desktop\\R Data\\TestData0812.txt', header = FALSE, colClasses=c('factor','factor','numeric','numeric','numeric','numeric'), sep=',')
> names(FDF)<- c('Date','Time','Open','High','Low','Close')
>
> # ID
> FDF2 <-FDF[1:100,]
> FDF2 <- as.ffdf(FDF2)
> a <- nrow(FDF2)
> # Take section of import for testing
> FDF2[1:3,]
Date Time Open High Low Close
1 1987.08.28 12:00 1.6238 1.6240 1.6237 1.6239
2 1987.08.28 12:01 1.6239 1.6240 1.6235 1.6236
3 1987.08.28 12:02 1.6236 1.6239 1.6235 1.6238
>
> ID <- data.frame(matrix(1:a, nrow = a, ncol=1 ))
> ID <- as.ffdf(ID)
> names(ID) <- c('ID')
> FDF3 <- cbind.ffdf2(ID, FDF2)
> # Create ID column and binds together
> FDF3[1:3,]
ID Date Time Open High Low Close
1 1 1987.08.28 12:00 1.6238 1.6240 1.6237 1.6239
2 2 1987.08.28 12:01 1.6239 1.6240 1.6235 1.6236
3 3 1987.08.28 12:02 1.6236 1.6239 1.6235 1.6238
The file I will be using this on is an ffdf object, as it is 700mb. I would like to know how I could split the dataset?
My current code is;
T = ffdfdply(FDF3, split(FDF3$ID, rep(1:10,each=10)))
I have done quite a few variation of this and research across the forum and other. However, for simplicity I've just included the above example.
Upon operation the code above gives me the following error;
Error in ffdfdply(FDF3, split(FDF3$ID, rep(1:10, each = 10))) :
split needs to be the same length as the number of rows in x
I can't seem to understand why a split of rep(1:10, each = 10) is not working in a data set that is > dim(FDF3)
[1] 100 7
I would like the split to perform even if there are not a full amount of rows for each split also, lets say: T = ffdfdply(FDF3, split(FDF3$ID, rep(1:10,each=3)))
I've been on this for at least 20 hours.
I couldn't figure out the correct usage of the ffdfdplyr package, and I am still unaware of whether it would have been a correct usage or not. However, I have constructed a work around and hope someone finds it useful. I would add, it is indeed ugly, therefore I'm open to suggestion on how to simply this and would appreciate your comments.
ffdfEnd <- 5
# Variable
ffdfrows = nrow(FDF3)
ffdfStart <- 1
ffdfLoop <- ffdfStart
ffdfSplitSize <- ffdfEnd
# Creates constants and varaibles
splitNum <- ffdfrows/ffdfEnd
# Calculates the number of split required
ffdf.names <- paste('FFDF', ffdfSplitSize, ffdfLoop:splitNum,sep='.')
# Creates names to be pasted to resulting tables
for (i in ffdfLoop:splitNum) {
assign(ffdf.names[i], as.ffdf(FDF3[ffdfStart:ffdfEnd,]))
ffdfStart = (ffdfEnd)
ffdfEnd = (ffdfEnd + ffdfSplitSize)}
# loops over until requirments are fulfilled`
I need to compute a condition over a column date in R. Atable would be:
PIL_final1<-data.frame( prior_day1_cart=c(4,8),
prior_day1_comp=c('2014-06-03','2014-06-07'),
dia_lim_23_cart=c('201-07-30','201-07-30') )
PIL_final1$prior_day1_comp<-as.Date(PIL_final1$prior_day1_comp, format='%Y-%m-%d')
PIL_final1$dia_lim_23_cart<-as.Date(PIL_final1$dia_lim_23_cart, format='%Y-%m-%d')
So I use ifelse:
PIL_final1$llamar_dia<-ifelse(PIL_final1$prior_day1_cart+6>23,
PIL_final1$dia_lim_23_cart ,
PIL_final1$prior_day1_comp+6)
But I get:
> PIL_final1
prior_day1_cart prior_day1_comp dia_lim_23_cart llamar_dia
1 4 2014-06-03 0201-07-30 16230
2 8 2014-06-07 0201-07-30 16234
And if I do:
> PIL_final1$prior_day1_comp+6
[1] "2014-06-09" "2014-06-13"
I get the right results.
How can I do the ifelse and get the date? thanks.
Also if I try this, I still get a number (although different):
> PIL_final1$llamar_dia<-ifelse(PIL_final1$prior_day1_cart+6>23,
+ PIL_final1$dia_lim_23_cart ,
+ as.Date(PIL_final$prior_day1_comp+6,format="%Y-%m-%d"))
> PIL_final1
prior_day1_cart prior_day1_comp dia_lim_23_cart llamar_dia
1 4 2014-06-03 0201-07-30 16376
2 8 2014-06-07 0201-07-30 16377
Edition:
Also if I do this:
> as.Date(ifelse(PIL_final1$prior_day1_cart+6>23, PIL_final1$dia_lim_23_cart ,
+ PIL_final1$prior_day1_comp+6), format="%Y-%m-%d", origin="1970-01-01")
[1] "2014-06-09" "2014-06-13"
I get the right results, but if I replace the ifelse with the vector result, I get the wrong dates:
> PIL_final1$llamar_dia<-ifelse(PIL_final1$prior_day1_cart+6>23,
+ PIL_final1$dia_lim_23_cart ,
+ PIL_final$prior_day1_comp+6)
> as.Date(PIL_final1$llamar_dia, format="%Y-%m-%d", origin="1970-01-01")
[1] "2014-11-02" "2014-11-03"
from ?ifelse :
The mode of the result may depend on the value of test (see the examples), Sometimes it is better >to use a construction such as
ifelse(test, yes, no) ~~ (tmp <- yes; tmp[!test] <- no[!test]; tmp)
Applying this :
dat$d3 <-
with(dat,{
tmp <- d2+6; tmp[!(x+6>23)] <- d1[!(x+6>23)]; tmp
})
dat
x d1 d2 d3
1 4 2014-06-03 0201-07-30 2014-06-03
2 8 2014-06-07 0201-07-30 2014-06-07
Maybe you should modify this to handle missing values in test.
Note I changed the variables names since yours are really long to type and a real source of errors.
dat <- data.frame( x=c(4,8),
d1=c('2014-06-03','2014-06-07'),
d2=c('201-07-30','201-07-30') )
I have a set of hospital admission data that I need to process, I am stuck when trying to loop the data and pick up the stuff I need, here is the example:
Date Ward
1 A
2 A
3 A
4 A B
5 A
6 A
7 A C
8 C
9 C
10 C
And I need them to be transformed into:
Ward Adm_Date Dis_Date
A 1 4
B 4 4
A 4 7
C 7 10
To put it in sentence, this is a admission record patient X who:
go to ward A from day 1 to day 4
go to ward B (maybe it's an ICU ward) for less than a day in day 4, and move back to ward A on that day
stay in ward A from day 4 to day 7
move to ward C from ward A from day 7 and stay in ward C till day 10
I am thinking of using ddply by filtering the ward but it is not OK since B will be "omitted" and the period of time for A is not broken down into 2 pieces.
Any suggestions? Thanks!
dat <- data.frame(Date=1:10,Ward=c(rep("A",3),"A B",rep("A",2),"A C",rep("C",3)))
dat$Ward <- as.character(dat$Ward)
# Change data to a "long" format
Date2 <- rep(dat$Date,nchar(gsub(" ","",dat$Ward)))
Ward2 <- unlist(strsplit(dat$Ward," "))
dat2 <- data.frame(Date=Date2,Ward=Ward2)
dat2$Ward <- as.character(dat2$Ward) # pesky factors!
# Create output
Ward3 <- unlist(strsplit(gsub("(\\w)\\1+","\\1",paste(dat2$Ward,collapse="")),""))
#helper function to find lengths of repeated characters, probably a better way of doing this
repCharLength <- function(str)
{
out <- numeric(0)
tmp <- 1
for (i in 2:length(str))
{
if (str[i]!=str[i-1])
{out<-c(out,tmp)
tmp<-1}
else
tmp <- tmp+1
}
return(c(out,tmp))
}
stays <- repCharLength(dat2$Ward)
Adm_Date <- c(1,dat2$Date[cumsum(stays)[1:(length(stays)-1)]])
Dis_Date <- dat2$Date[cumsum(stays)]
dat3 <- data.frame(Ward=Ward3,Adm_Date=Adm_Date,Dis_Date=Dis_Date)
> dat3
Ward Adm_Date Dis_Date
1 A 1 4
2 B 4 4
3 A 4 7
4 C 7 10
A bit more involved than I first thought, and there is probably a better way to get the stay lengths than using the helper function I wrote, but this seems to do the job.
Edit
In light of Spacedman's comment, there is a library function to calculate Ward3 and stays:
Ward3 <- rle(dat2$Ward)$values
stays <- rle(dat2$Ward)$lengths
It's not a complex answer but you can transform your data
X <- data.frame(
Date=1:10,
Ward=c("A","A","A","A B","A","A","A C","C","C","C"),
stringsAsFactors=FALSE
)
w <- strsplit(X$Ward," +")
n <- sapply(w, length)
X_mod <- data.frame(
Date = rep(X$Date, n),
Ward = unlist(w, FALSE, FALSE)
)
With X_mod you could write vectorized (=fast) solution. For start with(X_mod, c(0,cumsum(Ward[-1]!=Ward[-length(Ward)]))) gives you id of visit.