Separate column into two: before and after a certain word - r

I have the following data set
> data
firm_name
1: Light Ltd John Smith
2: Bolt Night Ltd Mary Poppins
3: Bright Yellow Sun Ltd Harry Potter
---
I want to separate it into two columns depending on the position of the "Ltd". So, the data would look like:
> data
firm_name name
1: Light Ltd John Smith
2: Bolt Night Ltd Mary Poppins
3: Bright Yellow Sun Ltd Harry Potter
---
I tried with the stringr package but did not find any particular solution.
thanks in advance

You can use separate from tidyr with a lookbehind regular expression for this.
library(tidyr)
# Split immediately after "Ltd" (lookbehind keeps "Ltd" in the first part) and
# consume the whitespace that follows, so `name` does not start with a space.
df %>%
  separate(col = firm_name, into = c("firm_name", "name"), sep = "(?<=Ltd)\\s+")
#> firm_name name
#> 1 Light Ltd John Smith
#> 2 Bolt Night Ltd Mary Poppins
#> 3 Bright Yellow Sun Ltd Harry Potter
data
df <- data.frame(firm_name = c("Light Ltd John Smith",
"Bolt Night Ltd Mary Poppins",
"Bright Yellow Sun Ltd Harry Potter"))

We can use base R with read.csv
# Insert a comma after the first "Ltd" and let read.csv() do the split.
# strip.white = TRUE drops the blank left after the inserted comma, so `name`
# has no leading space.
read.csv(text = sub("(Ltd)", "\\1,", df$names),
         header = FALSE, col.names = c('firm_name', 'name'),
         strip.white = TRUE)
# firm_name name
#1 Light Ltd John Smith
#2 Bolt Night Ltd Mary Poppins
#3 Bright Yellow Sun Ltd Harry Potter
data
df <- structure(list(names = c("Light Ltd John Smith",
"Bolt Night Ltd Mary Poppins",
"Bright Yellow Sun Ltd Harry Potter")), row.names = c(NA, -3L
), class = "data.frame")

Are you after something like this?
df <-
tibble(
names = c("Light Ltd John Smith",
"Bolt Night Ltd Mary Poppins",
"Bright Yellow Sun Ltd Harry Potter")
)
df %>%
tidyr::separate(names, c("half_1", "half_2"), sep = "Ltd")

Does this work:
> df %>% mutate(name = sub('([A-Za-z].*Ltd) (.*)', '\\2', firm_name), firm_name = sub('([A-Za-z].*Ltd) (.*)', '\\1', firm_name))
# A tibble: 3 x 2
firm_name name
<chr> <chr>
1 Light Ltd John Smith
2 Bolt Night Ltd Mary Poppins
3 Bright Yellow Sun Ltd Harry Potter
>
Data used:
> df
# A tibble: 3 x 1
firm_name
<chr>
1 Light Ltd John Smith
2 Bolt Night Ltd Mary Poppins
3 Bright Yellow Sun Ltd Harry Potter
>

Using tidyr::extract :
tidyr::extract(df, names, c('firm_name', 'name'), regex = '(.*Ltd)\\s(.*)')
# A tibble: 3 x 2
# firm_name name
# <chr> <chr>
#1 Light Ltd John Smith
#2 Bolt Night Ltd Mary Poppins
#3 Bright Yellow Sun Ltd Harry Potter
Or in base R :
df$name <- sub('.*Ltd\\s', '', df$names)
df$firm_name <- sub('(.*Ltd).*', '\\1', df$names)
df$names <- NULL

Another base R option
setNames(
data.frame(
do.call(
rbind,
strsplit(df$names, "(?<=Ltd)\\s+", perl = TRUE)
)
),
c("firm_name", "name")
)
giving
firm_name name
1 Light Ltd John Smith
2 Bolt Night Ltd Mary Poppins
3 Bright Yellow Sun Ltd Harry Potter

Related

How can I efficiently check variables for a particular value in R and flag rows containing it?

I want to create a variable that flags whether one or more of multiple variables has a particular value.
week Mon Tues Weds Thurs Fri Sat
1 jon jon jon jon mary mary
2 jane jane jane jane jane jane
3 mary mary mary mary mary jane
I want to create a binary variable that flags for each week whether Mon, Weds, or Sat of that week == "jon" or "mary" Is there a way to do this without creating a long ifelse statement that checks each variable individually?
week Mon Tues Weds Thurs Fri Sat flag
1 jon jon jon jon mary mary 1
2 jane jane jane jane jane jane 0
3 mary mary mary mary mary jane 1
I tried
df %>%
rowwise() %>%
mutate(flag = +any(c_across(Mon, Weds, Sat)
%in% ("jon", "mary")) %>%
ungroup()
but I get an error
Error: Problem with `mutate()` input `flag`.
x unused arguments (Mon, Weds, Sat)
i Input `flag` is `+...`.
i The error occurred in row 1.
df %>%
mutate(flag = colSums(apply(cbind(Mon, Weds, Sat), 1, `%in%`, c("jon", "mary"))) > 0)
# week Mon Tues Weds Thurs Fri Sat flag
# 1 1 jon jon jon jon mary mary TRUE
# 2 2 jane jane jane jane jane jane FALSE
# 3 3 mary mary mary mary mary jane TRUE
I think the problem with across is that it's trying to do something to each column, not a summary of sorts of all of them. Let's try purrr::pmap instead:
library(purrr)
# Walk the three columns row by row. c(...) gathers the row's values into one
# vector; unlist(...) would receive the 2nd and 3rd values as its `recursive`
# and `use.names` arguments instead of as data.
# pmap_int() returns an integer vector rather than a list column.
df %>%
  mutate(flag = pmap_int(list(Mon, Weds, Sat),
                         ~ +any(c(...) %in% c("jon", "mary"))))
# week Mon Tues Weds Thurs Fri Sat flag
# 1 1 jon jon jon jon mary mary 1
# 2 2 jane jane jane jane jane jane 0
# 3 3 mary mary mary mary mary jane 1
A third (using your request for c_across):
df %>%
rowwise() %>%
mutate(flag = +any(c_across(c(Mon, Weds, Sat)) %in% c("jon", "mary"))) %>%
ungroup()
# # A tibble: 3 x 8
# week Mon Tues Weds Thurs Fri Sat flag
# <int> <chr> <chr> <chr> <chr> <chr> <chr> <int>
# 1 1 jon jon jon jon mary mary 1
# 2 2 jane jane jane jane jane jane 0
# 3 3 mary mary mary mary mary jane 1
Instead of the rowwise or looping over the rows, we can make it more efficient if we loop over the columns with map and reduce it
library(purrr)
library(dplyr)
df %>%
mutate(flag = map(select(., Mon, Weds, Sat), `%in%`, c("jon", "mary")) %>%
reduce(`|`) %>% `+`)
# week Mon Tues Weds Thurs Fri Sat flag
#1 1 jon jon jon jon mary mary 1
#2 2 jane jane jane jane jane jane 0
#3 3 mary mary mary mary mary jane 1
A corresponding option in base R is lapply/Reduce
df$flag <- +(Reduce(`|`, lapply(df[c('Mon', 'Weds', 'Sat')],
`%in%`, c("jon", "mary"))))
data
df <- structure(list(week = 1:3, Mon = c("jon", "jane", "mary"), Tues = c("jon",
"jane", "mary"), Weds = c("jon", "jane", "mary"), Thurs = c("jon",
"jane", "mary"), Fri = c("mary", "jane", "mary"), Sat = c("mary",
"jane", "jane")), class = "data.frame", row.names = c(NA, -3L
))
Here is another base R option using rowSums + Reduce
df$flag <- +(rowSums(
Reduce(
`+`,
lapply(
c("jon", "mary"),
`==`,
df[c("Mon", "Weds", "Sat")]
)
)
) > 0)
such that
week Mon Tues Weds Thurs Fri Sat flag
1 1 jon jon jon jon mary mary 1
2 2 jane jane jane jane jane jane 0
3 3 mary mary mary mary mary jane 1

Expand data.table so one row per pattern match of each ID

I have a lot of text data in a data.table. I have several text patterns that I'm interested in. I have managed to subset the table so it shows text that matches at least two of the patterns (relevant question here).
I now want to be able to have one row per match, with an additional column that identifies the match - so rows where there are multiple matches will be duplicates apart from that column.
It feels like this shouldn't be too hard but I'm struggling! My vague thoughts are around maybe counting the number of pattern matches, then duplicating the rows that many times...but then I'm not entirely sure how to get the label for each different pattern...(and also not sure that is very efficient anyway).
Thanks for your help!
Example data
library(data.table)
library(stringr)
text_table <- data.table(ID = (1:5),
text = c("lucy, sarah and paul live on the same street",
"lucy has only moved here recently",
"lucy and sarah are cousins",
"john is also new to the area",
"paul and john have known each other a long time"))
text_patterns <- as.character(c("lucy", "sarah", "paul|john"))
# Filtering the table to just the IDs with at least two pattern matches.
# The condition goes in `i` (before the comma) so that rows are subset; placed
# in `j` it only returns the logical vector itself.
text_table_multiples <- text_table[Reduce(`+`, lapply(text_patterns,
  function(x) str_detect(text, x))) > 1]
Ideal output
required_table <- data.table(ID = c(1, 1, 1, 2, 3, 3, 4, 5),
text = c("lucy, sarah and paul live on the same street",
"lucy, sarah and paul live on the same street",
"lucy, sarah and paul live on the same street",
"lucy has only moved here recently",
"lucy and sarah are cousins",
"lucy and sarah are cousins",
"john is also new to the area",
"paul and john have known each other a long time"),
person = c("lucy", "sarah", "paul or john", "lucy", "lucy", "sarah", "paul or john", "paul or john"))
A way to do that is to create a variable for each indicator and melt:
library(stringi)
# Add one logical indicator column per pattern (`:=` modifies text_table by
# reference, so these columns stay on the table afterwards).
text_table[, lucy := stri_detect_regex(text, 'lucy')][
  , sarah := stri_detect_regex(text, 'sarah')][
  , `paul or john` := stri_detect_regex(text, 'paul|john')]
# Reshape to long form, keep only the rows where the pattern matched, then
# drop the indicator column. TRUE is spelled out because `T` can be reassigned.
melt(text_table, id.vars = c("ID", "text"))[value == TRUE][, -"value"]
## ID text variable
## 1: 1 lucy, sarah and paul live on the same street lucy
## 2: 2 lucy has only moved here recently lucy
## 3: 3 lucy and sarah are cousins lucy
## 4: 1 lucy, sarah and paul live on the same street sarah
## 5: 3 lucy and sarah are cousins sarah
## 6: 1 lucy, sarah and paul live on the same street paul or john
## 7: 4 john is also new to the area paul or john
## 8: 5 paul and john have known each other a long time paul or john
A tidy way of doing the same procedure is:
library(tidyverse)
text_table %>%
mutate(lucy = stri_detect_regex(text, 'lucy')) %>%
mutate(sarah = stri_detect_regex(text, 'sarah')) %>%
mutate(`paul or john` = stri_detect_regex(text, 'paul|john')) %>%
gather(value = value, key = person, - c(ID, text)) %>%
filter(value) %>%
select(-value)
DISCLAIMER: this is not an idiomatic data.table solution
I would build a helper function like the following, that takes a single row and an input and returns a new data.table with N rows:
library(data.table)
library(tidyverse)
#' Expand one row of the text table into one row per matching pattern.
#'
#' @param dtRow A data.table; only its first row is read (columns `ID` and
#'   `text`).
#' @param patterns Character vector of regular expressions to look for.
#'   Defaults to the global `text_patterns`.
#' @return A data.table with columns `ID`, `text` and `person`, holding one row
#'   for every pattern found in the text (zero rows when nothing matches).
new_rows <- function(dtRow, patterns = text_patterns){
  row_text <- dtRow[1, text]
  row_id <- dtRow[1, ID]
  # Iterate over the `patterns` argument (not the global `text_patterns`) so
  # that a caller-supplied pattern set is actually honoured.
  res <- map(patterns, function(word) {
    matched <- length(grep(x = row_text, pattern = word)) > 0
    hit <- str_extract(string = row_text, pattern = word)
    # "paul" and "john" are reported under one shared label.
    personField <- if (!is.na(hit) && hit %in% c("paul", "john")) {
      "paul or john"
    } else {
      hit
    }
    # Non-matching patterns yield an all-NA row that is filtered out below.
    data.table(
      ID = if (matched) row_id else NA,
      text = if (matched) row_text else NA_character_,
      person = personField
    )
  }) %>%
    rbindlist()
  res[!is.na(text), ]
}
And I will execute it:
split(text_table, f = text_table[['ID']]) %>%
map_df(function(r) new_rows(dtRow = r))
The answer is:
ID text person
1: 1 lucy, sarah and paul live on the same street lucy
2: 1 lucy, sarah and paul live on the same street sarah
3: 1 lucy, sarah and paul live on the same street paul or john
4: 2 lucy has only moved here recently lucy
5: 3 lucy and sarah are cousins lucy
6: 3 lucy and sarah are cousins sarah
7: 4 john is also new to the area paul or john
8: 5 paul and john have known each other a long time paul or john
which looks like your required_table (duplicated IDs included)
ID text person
1: 1 lucy, sarah and paul live on the same street lucy
2: 1 lucy, sarah and paul live on the same street sarah
3: 1 lucy, sarah and paul live on the same street paul or john
4: 2 lucy has only moved here recently lucy
5: 3 lucy and sarah are cousins lucy
6: 3 lucy and sarah are cousins sarah
7: 4 john is also new to the area paul or john
8: 5 paul and john have known each other a long time paul or john

Replace multiple strings/values based on separate list

I have a data frame that looks similar to this:
EVENT ID GROUP YEAR X.1 X.2 X.3 Y.1 Y.2 Y.3
1 1 John Smith GROUP1 2015 1 John Smith 5 Adam Smith 12 Mike Smith 20 Sam Smith 7 Luke Smith 3 George Smith
Each row repeats for new logs, but the values in X.1 : Y.3 change often.
The ID's and the ID's present in X.1 : Y.3 have a numeric value and then the name ID, i.e., "1 John Smith" or "20 Sam Smith" will be the string.
I have an issue where in certain instances, the ID will remain as "1 John Smith" but in X.1 : Y.3 the number may change preceding "John Smith", so for example it might be "14 John Smith". The names will always be correct, it's just the number that sometimes gets mixed up.
I have a list of 200+ ID's that are impacted by this mismatch - what is the most efficient way to replace the values in X.1 : Y.3 so that they match the correct ID in column ID?
I won't know which column "14 John Smith" shows up in, it could be X.1, or Y.2, or Y.3 depending on the row.
I can use a replace function in a dplyr line of code, or gsub for each of the 200+ ID's and for each column affected, but it seems very inefficient. Is there a quicker way than repeating something like the below x times?
df%>%mutate(X.1=replace(X.1, grepl('John Smith', X.1), "1 John Smith"))%>%as.data.frame()
Sometimes it helps to temporarily reshape the data. That way we can operate on all the X and Y values without iterating over them.
library(stringr)
library(tidyr)
## some data to work with
exd <- read.csv(text = "EVENT,ID,GROUP,YEAR,X.1,X.2,X.3,Y.1,Y.2,Y.3
1,1 John Smith,GROUP1,2015,19 John Smith,11 Adam Smith,9 Sam Smith,5 George Smith,13 Mike Smith,12 Luke Smith
2,2 John Smith,GROUP1,2015,1 George Smith,9 Luke Smith,19 Adam Smith,7 Sam Smith,17 Mike Smith,11 John Smith
3,3 John Smith,GROUP1,2015,5 George Smith,18 John Smith,12 Sam Smith,6 Luke Smith,2 Mike Smith,4 Adam Smith",
stringsAsFactors = FALSE)
## re-arrange to put X and Y columns into a single column
exd <- gather(exd, key = "var", value = "value", X.1, X.2, X.3, Y.1, Y.2, Y.3)
## find the X and Y values that contain the ID name
matches <- str_detect(exd$value, str_replace_all(exd$ID, "^\\d+ *", ""))
## replace X and Y values with the matching ID
exd[matches, "value"] <- exd$ID[matches]
## put it back in the original shape
exd <- spread(exd, key = "var", value = value)
exd
## EVENT ID GROUP YEAR X.1 X.2 X.3 Y.1 Y.2 Y.3
## 1 1 1 John Smith GROUP1 2015 1 John Smith 11 Adam Smith 9 Sam Smith 5 George Smith 13 Mike Smith 12 Luke Smith
## 2 2 2 John Smith GROUP1 2015 1 George Smith 9 Luke Smith 19 Adam Smith 7 Sam Smith 17 Mike Smith 2 John Smith
## 3 3 3 John Smith GROUP1 2015 5 George Smith 3 John Smith 12 Sam Smith 6 Luke Smith 2 Mike Smith 4 Adam Smith
Not sure if you're set on dplyr and piping, but I think this is a plyr solution that does what you need. Given this example dataset:
> df
EVENT ID GROUP YEAR X.1 X.2 X.3 Y.1 Y.2 Y.3
1 1 1 John Smith GROUP1 2015 19 John Smith 11 Adam Smith 9 Sam Smith 5 George Smith 13 Mike Smith 12 Luke Smith
2 2 2 John Smith GROUP1 2015 1 George Smith 9 Luke Smith 19 Adam Smith 7 Sam Smith 17 Mike Smith 11 John Smith
3 3 3 John Smith GROUP1 2015 5 George Smith 18 John Smith 12 Sam Smith 6 Luke Smith 2 Mike Smith 4 Adam Smith
This adply function goes row by row and replaces any matching X:Y column values with the one from the ID column:
library(plyr)
adply(df, .margins = 1, function(x) {
idcol <- as.character(x$ID)
searchname <- trimws(gsub('[[:digit:]]+', "", idcol))
sapply(x[5:10], function(y) {
ifelse(grepl(searchname, y), idcol, as.character(y))
})
})
Output:
EVENT ID GROUP YEAR X.1 X.2 X.3 Y.1 Y.2 Y.3
1 1 1 John Smith GROUP1 2015 1 John Smith 11 Adam Smith 9 Sam Smith 5 George Smith 13 Mike Smith 12 Luke Smith
2 2 2 John Smith GROUP1 2015 1 George Smith 9 Luke Smith 19 Adam Smith 7 Sam Smith 17 Mike Smith 2 John Smith
3 3 3 John Smith GROUP1 2015 5 George Smith 3 John Smith 12 Sam Smith 6 Luke Smith 2 Mike Smith 4 Adam Smith
Data:
names <- c("EVENT","ID",'GROUP','YEAR', paste(rep(c("X.", "Y."), each = 3), 1:3, sep = ""))
first <- c("John", "Sam", "Adam", "Mike", "Luke", "George")
set.seed(2017)
randvals <- t(sapply(1:3, function(x) paste(sample(1:20, size = 6),
paste(sample(first, replace = FALSE, size = 6), "Smith"))))
df <- cbind(data.frame(1:3, paste(1:3, "John Smith"), "GROUP1", 2015), randvals)
names(df) <- names
I think that the most efficient way to accomplish this is by building a loop. The reason is that you will have to repeat the function to replace the names for every name in your ID list. With a loop, you can automate this.
I will make some assumptions first:
The ID list can be read as a character vector
You don't have any typos in the ID list or in your data.frame, including
different lowercase and uppercase letters in the names.
Your ID list does not contain the numbers. In case that it does contain numbers, you have to use gsub to erase them.
The example can work with a data.frame (DF) with the same structure that
you put in your question.
>
ID <- c("John Smith", "Adam Smith", "George Smith")
# grep() on a whole data.frame matches against each column deparsed into one
# string, so it returns column indices and whole columns would be overwritten.
# Work on each of the X.1:Y.3 columns (5 to 10) as a plain character vector so
# that only the matching cells are replaced.
DF[5:10] <- lapply(DF[5:10], as.character)
for (id in ID) {
  DF[5:10] <- lapply(DF[5:10], function(column) {
    column[grepl(id, column, fixed = TRUE)] <- id
    column
  })
}
With each round this loop will:
Identify the positions in the columns X.1:Y.3 (columns 5 to 10 in your question) where the name "i" appears.
Then, it will change all those values to the one in the "i" position of the ID vector.
So, the first iteration will do: 1) Search for every position where the name "John Smith" appears in the data frame. 2) Replace all those "# John Smith" with "John Smith".
Note: If you simply want to delete the numbers, you can use gsub to replace them. Take into account that you probably want to erase the first space between the number and the name too. One way to do this is using gsub and a regular expression:
# gsub() must be applied column by column: handed a data.frame it coerces each
# whole column to a single deparsed string instead of working on the cells.
DF[5:10] <- lapply(DF[5:10], function(column) gsub("[0-9]+ ", "", column))

Match text across multiple rows in R

My data.frame(Networks) contains the following:
Location <- c("Farm", "Supermarket", "Farm", "Conference",
"Supermarket", "Supermarket")
Instructor <- c("Bob", "Bob", "Louise", "Sally", "Lee", "Jeff")
Operator <- c("Lee", "Lee", "Julie", "Louise", "Bob", "Louise")
Networks <- data.frame(Location, Instructor, Operator, stringsAsFactors=FALSE)
MY QUESTION
I wish to include a new column Transactions$Count in a new data.frame Transactions that sums the exchanges between each Instructor and Operator for every Location
EXPECTED OUTPUT
# Expected result: one row per Location and unordered pair of people, with the
# number of exchanges between them. All four vectors must have five elements,
# and `<-` must be written without a space (`< -` is "less than minus").
Location <- c("Farm", "Supermarket", "Farm", "Conference", "Supermarket")
Person1 <- c("Bob", "Bob", "Louise", "Sally", "Jeff")
Person2 <- c("Lee", "Lee", "Julie", "Louise", "Louise")
Count <- c(1, 2, 1, 1, 1)
Transactions <- data.frame(Location, Person1, Person2, Count,
                           stringsAsFactors=FALSE)
For example, there would be a total of 2 exchanges between Bob and Lee at the Supermarket. It does not matter if one person is a instructor or operator, I am interested in their exchange. In the expected output, the two exchanges between Bob and Lee at the Supermarket are noted. There is one exchange for every other combination at the other locations.
WHAT I HAVE TRIED
I thought grepl may be of use, but I wish to iterate across 1300 rows of this data, so it may be computationally expensive.
Thank you.
You can consider using "data.table" and use pmin and pmax in your "by" argument.
Example:
Networks <- data.frame(Location, Instructor, Operator, stringsAsFactors = FALSE)
library(data.table)
as.data.table(Networks)[
, TransCount := .N,
by = list(Location,
pmin(Instructor, Operator),
pmax(Instructor, Operator))][]
# Location Instructor Operator TransCount
# 1: Farm Bob Lee 1
# 2: Supermarket Bob Lee 2
# 3: Farm Louise Julie 1
# 4: Conference Sally Louise 1
# 5: Supermarket Lee Bob 2
# 6: Supermarket Jeff Louise 1
Based on your update, it sounds like this might be more appropriate for you:
# pmin()/pmax() are vectorised, so the alphabetically ordered pair can be built
# for all rows at once; grouping by row number is unnecessary.
as.data.table(Networks)[
  , c("Person1", "Person2") := list(
    pmin(Instructor, Operator),
    pmax(Instructor, Operator))
  ][
    # Count the exchanges per location and unordered pair.
    , list(TransCount = .N),
    by = .(Location, Person1, Person2)
    ]
# Location Person1 Person2 TransCount
# 1: Farm Bob Lee 1
# 2: Supermarket Bob Lee 2
# 3: Farm Julie Louise 1
# 4: Conference Louise Sally 1
# 5: Supermarket Jeff Louise 1
You may try
library(dplyr)
Networks %>%
group_by(Location, Person1=pmin(Instructor,Operator),
Person2= pmax(Instructor,Operator)) %>%
summarise(Count=n())
# Location Person1 Person2 Count
#1 Conference Louise Sally 1
#2 Farm Bob Lee 1
#3 Farm Julie Louise 1
#4 Supermarket Bob Lee 2
#5 Supermarket Jeff Louise 1
Or using base R
d1 <-cbind(Location=Networks[,1],
data.frame(setNames(Map(do.call, c('pmin', 'pmax'),
list(Networks[-1])), c('Person1', 'Person2'))))
aggregate(cbind(Count=1:nrow(d1))~., d1, FUN=length)
# Location Person1 Person2 Count
#1 Farm Bob Lee 1
#2 Supermarket Bob Lee 2
#3 Supermarket Jeff Louise 1
#4 Farm Julie Louise 1
#5 Conference Louise Sally 1
data
Networks <- data.frame(Location, Instructor, Operator,
stringsAsFactors=FALSE)

variable value occurring on 2 dates R

I want to find who had an apple or an orange on at least 2 different (unique) dates. I would like to create a new column with a binary indicator for whether an individual had an orange or an apple on at least two dates (1=yes, 0=no).
The nearest I've come is this plyr code.
df1<- ddply(df, .(names, fruit), mutate, acne = ifelse(fruit=="apple" | fruit=="orange" & length(unique(dates))>=2,1,0))
This is not the solution however. anne gets apples twice but on the same date, so she should not get a 1 here. Similarly ted gets a 1, even though he only got an apple once.
This is closer, but still not correct. It gives a 1 to any fruit that has occurred twice. Need the fruit to occur twice per person on two individual dates per person
df2<- ddply(df, .(fruit), mutate, acne = ifelse(length(unique(dates))>=2, 1, 0))
##this one gives a 1 to any fruit that has occurred twice. Need the fruit to occur twice per person on two individual dates per person.
If anyone could point me in the right direction here I would be very grateful.
Thank you in advance
SAMPLE DF
names<-as.character(c("john", "john", "philip", "ted", "john", "john", "anne", "john", "mary","anne", "mary","mary","philip","mary", "su","mary", "jim", "sylvia", "mary", "ted","ted","mary", "sylvia", "jim", "ted", "john", "ted"))
dates<-as.Date(c("2010-07-01", "2010-07-13", "2010-05-12","2010-02-14","2010-06-30","2010-08-15", "2010-03-21","2010-04-04","2010-09-01", "2010-03-21", "2010-12-01", "2011-01-01", "2010-08-12", "2010-11-11", "2010-05-12", "2010-12-03", "2010-07-12", "2010-12-21", "2010-02-18", "2010-10-29", "2010-08-13", "2010-11-11", "2010-05-12", "2010-04-01", "2010-05-06", "2010-09-28", "2010-11-28" ))
fruit<-as.character(c("kiwi","apple","mango", "banana","strawberry","orange","apple","raspberry", "orange","apple","orange", "apple", "strawberry", "apple", "pineapple", "peach", "orange", "nectarine", "grape","banana", "melon", "apricot", "plum", "lychee", "mango", "watermelon", "apple" ))
df<-data.frame(names,dates,fruit)
df
Desired output
names dates fruit v1
7 anne 2010-03-21 apple 0
10 anne 2010-03-21 apple 0
17 jim 2010-07-12 orange 0
24 jim 2010-04-01 lychee 0
1 john 2010-07-01 kiwi 1
2 john 2010-07-13 apple 1
5 john 2010-06-30 strawberry 1
6 john 2010-08-15 orange 1
8 john 2010-04-04 raspberry 1
26 john 2010-09-28 watermelon 1
9 mary 2010-09-01 orange 1
11 mary 2010-12-01 orange 1
12 mary 2011-01-01 apple 1
14 mary 2010-11-11 apple 1
16 mary 2010-12-03 peach 1
19 mary 2010-02-18 grape 1
22 mary 2010-11-11 apricot 1
3 philip 2010-05-12 mango 0
13 philip 2010-08-12 strawberry 0
15 su 2010-05-12 pineapple 0
18 sylvia 2010-12-21 nectarine 0
23 sylvia 2010-05-12 plum 0
4 ted 2010-02-14 banana 0
20 ted 2010-10-29 banana 0
21 ted 2010-08-13 melon 0
25 ted 2010-05-06 mango 0
27 ted 2010-11-28 apple 0
this should probably do the trick:
# For each person, take the row indices of that person, keep those whose fruit
# is an orange or an apple, and test whether they span more than one distinct
# date. ave() writes the logical back into an integer vector, giving 0/1.
# seq_len() is used so an empty data frame does not produce the indices c(1, 0).
v1 <- ave(seq_len(nrow(df)), df$names, FUN = function(x) {
  length(unique(df$dates[x[df$fruit[x] %in% c("orange", "apple")]])) > 1
})
df$v1 <- v1
df <- df[order(df$names), ]
If I understood correctly, for the purpose of your problem, apples == oranges. So the plan is
to (a) create a small data.frame where fruits are oranges or apples only, as you don't care about other fruits, (b) select only unique date/name rows, (c) aggregate by name and (d) merge back to your original data.frame to get your result:
ndf <- subset(df, fruit %in% c("apple", "orange"))
ndf <- ndf[!duplicated(ndf[, c("names", "dates")]), ]
Here you can use table, but I prefer aggregate
# Count, per person, the de-duplicated apple/orange dates left in `ndf`.
v <- aggregate(rep(1, nrow(ndf)), by = ndf[, "names", drop = FALSE], sum)
v$x <- ifelse(v$x > 1, 1, 0)
# Left join: people with no apple/orange rows are absent from `v` and must be
# kept with a flag of 0 rather than dropped from the result.
rv <- merge(df, v, all.x = TRUE)
rv$x[is.na(rv$x)] <- 0
It is a bit longer, codewise, than other answers but clear and most certainly does the job.
You could just use aggregate without the first two parts, but if you have huge data.frame, with lots of names aggregating for every name can prove very costly.
I did something similar to @amit's solution using by. Rownames got mangled during do.call, but you can fix that.
result <- by(df, INDICES = df$names, FUN = function(x) {
  # Only the dates on which this person had an apple or an orange matter.
  ao.dates <- x$dates[x$fruit %in% c("apple", "orange")]
  # Flag the person when those dates cover at least two distinct days; counting
  # rows instead would wrongly flag two apples eaten on the same date.
  x$index <- if (length(unique(ao.dates)) >= 2) 1 else 0
  x
})
do.call("rbind", result)
names dates fruit index
anne.7 anne 2010-03-21 apple 0
anne.10 anne 2010-03-21 apple 0
jim.17 jim 2010-07-12 orange 0
jim.24 jim 2010-04-01 lychee 0
john.1 john 2010-07-01 kiwi 1
john.2 john 2010-07-13 apple 1
john.5 john 2010-06-30 strawberry 1
john.6 john 2010-08-15 orange 1
john.8 john 2010-04-04 raspberry 1
john.26 john 2010-09-28 watermelon 1
mary.9 mary 2010-09-01 orange 1
mary.11 mary 2010-12-01 orange 1
mary.12 mary 2011-01-01 apple 1
mary.14 mary 2010-11-11 apple 1
mary.16 mary 2010-12-03 peach 1
mary.19 mary 2010-02-18 grape 1
mary.22 mary 2010-11-11 apricot 1
philip.3 philip 2010-05-12 mango 0
philip.13 philip 2010-08-12 strawberry 0
su su 2010-05-12 pineapple 0
sylvia.18 sylvia 2010-12-21 nectarine 0
sylvia.23 sylvia 2010-05-12 plum 0
ted.4 ted 2010-02-14 banana 0
ted.20 ted 2010-10-29 banana 0
ted.21 ted 2010-08-13 melon 0
ted.25 ted 2010-05-06 mango 0
ted.27 ted 2010-11-28 apple 0

Resources