Remove duplicates from ONE column not row - r

I am trying to remove duplicate emails in a column of my data.frame using duplicated() and distinct() in R; however, I do not need it to delete the whole row, just the duplicate email addresses in that column. Is there any way to do that using these? Or is there another way to do this?
library(tidyverse)
# Reproducible example: one row per patient record. "Carrie Bird" appears
# twice with the same email but a different salary and start date.
# Every name is kept on a single line: a line break inside the quotes would
# become part of the string ("Carrie\nBird") and hide the duplicate.
patient2 <- c(
  "John Doe", "Peter Gynn", "Jolie Hope", "Mycroft Holmes", "Carrie Bird",
  "Carrie Bird", "Marcus Quimby", "Jennifer Poe", "Donna Moon"
)
salary2 <- c(21000, 23400, 26800, 40000, 50000, 33000, 24000, 75000, 90000)
email2 <- c(
  "doe#gmail.com", "gynn#gmail.com", "hope#gmail.com",
  "holmes#gmail.com", "bird#gmail.com", "bird#gmail.com", "quimby#gmail.com",
  "poe#gmail.com", "moon#gmail.com"
)
startdate2 <- as.Date(c(
  "2010-11-1", "2008-3-25", "2007-3-14", "2020-7-19",
  "2019-4-20", "2018-2-13", "2017-4-21", "2019-6-10", "2010-9-19"
))
patient.data_2 <- data.frame(patient2, salary2, email2, startdate2)
print(patient.data_2)
patient2<fctr> salary2<dbl> email2<fctr> startdate2<date>
John Doe 21000 doe#gmail.com 2010-11-01
Peter Gynn 23400 gynn#gmail.com 2008-03-25
Jolie Hope 26800 hope#gmail.com 2007-03-14
Mycroft Holmes 40000 holmes#gmail.com 2020-07-19
Carrie Bird 50000 bird#gmail.com 2019-04-20
Carrie Bird 33000 bird#gmail.com 2018-02-13
Marcus Quimby 24000 quimby#gmail.com 2017-04-21
Jennifer Poe 75000 poe#gmail.com 2019-06-10
Donna Moon 90000 moon#gmail.com 2010-09-19
# Attempt that drops the whole duplicate row (not the desired result):
# keep only the first row seen for each distinct email address.
extracted <- patient.data_2[!duplicated(patient.data_2$email2), ]
extracted
All I would like to do is remove the extra duplicate email for the person
Carrie Bird. Not the entire row because the date is different. I tried using
duplicated() and distinct() and both removed the entire row.

You could use the duplicated function:
# Toy column in which every value is repeated at least once.
dat <- data.frame(a = rep(c(1, 2, 3, 4), times = c(2, 2, 2, 4)))
# Keep the first occurrence of each value and overwrite every repeat with NA;
# the number of rows is unchanged.
dat$a <- replace(dat$a, duplicated(dat$a), NA)
dat
#> a
#> 1 1
#> 2 NA
#> 3 2
#> 4 NA
#> 5 3
#> 6 NA
#> 7 4
#> 8 NA
#> 9 NA
#> 10 NA

Using dplyr
library(dplyr)
# Same idea with dplyr: overwrite `a`, turning every repeated value into NA.
dat <- mutate(dat, a = replace(a, duplicated(a), NA))

Related

Joining Dataframes in R, Matching Patterns in Strings

Two big real life tables to join up, but here's a little reprex:
I've got a table of small strings and I want to left join on a second table, with the join being based on whether or not these small strings can be found inside the bigger strings on the second table.
# Lookup table of short, lower-case keywords.
df_1 <- data.frame(
  index = seq_len(5),
  keyword = c("john", "ella", "mil", "nin", "billi")
)

# Table of full names that the keywords should be searched in.
df_2 <- data.frame(
  index_2 = 1001:1008,
  name = c(
    "John Coltrane", "Ella Fitzgerald", "Miles Davis", "Billie Holliday",
    "Nina Simone", "Bob Smith", "John Brown", "Tony Montana"
  )
)

# Desired result: every keyword paired with each name that contains it
# ("john" matches two names, so it appears twice).
df_results_i_want <- data.frame(
  index = c(1, 1:5),
  keyword = c("john", "john", "ella", "mil", "nin", "billi"),
  index_2 = c(1001, 1007, 1002, 1003, 1005, 1004),
  name = c(
    "John Coltrane", "John Brown", "Ella Fitzgerald",
    "Miles Davis", "Nina Simone", "Billie Holliday"
  )
)
Seems like a str_detect() call and a left_join() call might be part of the solution - ie I'm hoping for something like:
library(tidyverse)
df_results <- df_1 |> left_join(df_2, join_by(blah blah str_detect() blah blah))
I'm using dplyr 1.1 so I can use join_by(), but I'm not sure of the correct way to get what I need - can anyone help please?
I suppose I could do a simple cross join using tidyr::crossing() and then do the str_detect() stuff afterwards (and filter out things that don't match)
# Brute-force alternative: pair every keyword with every name, then keep only
# the pairs where the keyword occurs (ignoring case) inside the name.
df_results <- df_1 |>
  crossing(df_2) |>
  filter(str_detect(name, fixed(keyword, ignore_case = TRUE)))
but in my real life example, the cross join would produce an absolutely enormous table that would overwhelm my PC.
Thank you.
You can try fuzzyjoin::regex_join():
library(fuzzyjoin)
# Join each name in df_2 to the df_1 keywords it matches as a regular
# expression, ignoring case. TRUE is spelled out because T can be reassigned.
regex_join(df_2, df_1, by = c("name" = "keyword"), ignore_case = TRUE)
Output:
index.x name index.y keyword
1 1001 John Coltrane 1 john
2 1002 Ella Fitzgerald 2 ella
3 1003 Miles Davis 3 mil
4 1004 Billie Holliday 5 billi
5 1005 Nina Simone 4 nin
6 1007 John Brown 1 john
join_by() supports inequality joins but not inexact (pattern-matching) joins; you can use fuzzyjoin instead:
library(dplyr)
library(fuzzyjoin)
# Lower-case the names so the lower-case keywords can match, then keep every
# row of df_1 and attach each df_2 row whose name contains the keyword.
# In match_fun, `x` is the keyword column and `y` the name column.
df_2 %>%
  mutate(name = tolower(name)) %>%
  fuzzy_left_join(
    df_1, .,
    by = c(keyword = "name"),
    # stringr is not attached by the library() calls above, so qualify it.
    match_fun = \(x, y) stringr::str_detect(y, x)
  )
index keyword index_2 name
1 1 john 1001 john coltrane
2 1 john 1007 john brown
3 2 ella 1002 ella fitzgerald
4 3 mil 1003 miles davis
5 4 nin 1005 nina simone
6 5 billi 1004 billie holliday
We can use SQL to do that.
library(sqldf)
# Left join df_1 to df_2 in SQL: a df_2 row matches when its name contains
# the df_1 keyword (the '%' wildcards allow any text on either side).
sqldf("select * from [df_1] A
left join [df_2] B on B.name like '%' || A.keyword || '%'")
giving:
index keyword index_2 name
1 1 john 1001 John Coltrane
2 1 john 1007 John Brown
3 2 ella 1002 Ella Fitzgerald
4 3 mil 1003 Miles Davis
5 4 nin 1005 Nina Simone
6 5 billi 1004 Billie Holliday
It can be placed in a pipeline like this:
library(magrittr)
library(sqldf)
# Same query inside a magrittr pipeline: the braces keep the pipe from
# inserting df_1 as an argument of sqldf(); the query refers to the piped
# data frame as the table [.] instead.
df_1 %>%
{ sqldf("select * from [.] A
left join [df_2] B on B.name like '%' || A.keyword || '%'")
}

Merge dataframe with a key value that is contained within a string in a separate dataframe

# Employees and the ID code stored in their `salary` column.
employee <- c("John", "Peter", "Gynn", "Jolie", "Hope", "Sue", "Jane", "Sarah")
salary <- c("VT020", "VT126", "VT027", "VT667", "VC120", "VT000", "VA120", "VA020")
emp <- data.frame(employee = employee, salary = salary)

# Benefits, each with a space-separated string of the ID codes it applies to.
benefit <- c("Health", "Time", "Bonus")
benefit_id <- c("VT020 VT126 VT667 VA020", "VT667", "VT126 VT667 VT000")
ben <- data.frame(benefit = benefit, benefit_id = benefit_id)
Above we have two data frames: one contains names and a unique ID, the other contains a category and a list of unique IDs.
What is the most efficient way to merge the ben dataframe with the emp dataframe such that we get the appropriate benefit assigned to each employee?
tidyverse
library(dplyr)
library(tidyr) # tidyr
# Split each benefit's ID string into one row per ID, then left-join onto emp
# so every employee is kept (NA when no benefit lists their ID).
ben %>%
  mutate(benefit_id = strsplit(benefit_id, "\\s+")) %>%
  unnest(benefit_id) %>%
  left_join(x = emp, y = ., by = c(salary = "benefit_id"))
# employee salary benefit
# 1 John VT020 Health
# 2 Peter VT126 Health
# 3 Peter VT126 Bonus
# 4 Gynn VT027 <NA>
# 5 Jolie VT667 Health
# 6 Jolie VT667 Time
# 7 Jolie VT667 Bonus
# 8 Hope VC120 <NA>
# 9 Sue VT000 Bonus
# 10 Jane VA120 <NA>
# 11 Sarah VA020 Health
Depending on your needs, you may also prefer a different join. For instance, use a full_join if you want all pairings, where NA in employee indicates a benefit sans employee.
FYI: if you are running R before 4.0, then you might have factors in your data. To fix that, just convert the factor columns with as.character first. (This can be determined with sapply(ben, inherits, "factor").)
data.table
library(data.table)
# Convert emp to a data.table by reference.
setDT(emp)
# One row per (benefit, ID): split each space-separated ID string.
ben_long <- setDT(ben)[
  , .(benefit_id = unlist(strsplit(benefit_id, split = " "))),
  by = benefit
]
# Left join on the ID code; employees without a matching benefit get NA.
merge(emp, ben_long, by.x = "salary", by.y = "benefit_id", all.x = TRUE)
salary employee benefit
1: VA020 Sarah Health
2: VA120 Jane <NA>
3: VC120 Hope <NA>
4: VT000 Sue Bonus
5: VT020 John Health
6: VT027 Gynn <NA>
7: VT126 Peter Health
8: VT126 Peter Bonus
9: VT667 Jolie Health
10: VT667 Jolie Time
11: VT667 Jolie Bonus

Add multiple new columns to the dataset, based on another dataset's elements

I have the following products list
> products
# A tibble: 311 x 1
value
<fct>
1 NA
2 Alternativ Economy
3 Ambulant Balance
4 Ambulant Economy
5 Ambulant Premium
6 Ambulant 2
7 Ambulant 3
8 Ambulant 1
9 COMPLETA
10 HOSPITAL ECO
# ... with 301 more rows
and the following df
> df <- data.frame(employee = c('John Doe','Peter Gynn','Jolie Hope'),
+ salary = c(21000, 23400, 26800),
+ startdate = as.Date(c('2010-11-1','2008-3-25','2007-3-14')))
> df
employee salary startdate
1 John Doe 21000 2010-11-01
2 Peter Gynn 23400 2008-03-25
3 Jolie Hope 26800 2007-03-14
Now, I want to add the elements of the former (i.e. products) as variables of the latter (i.e. the df). I use
# Failing attempt: build one NA entry per column of `products`, name the
# entries with `products` itself, and bind the result onto df.
cbind(df, setNames(lapply(products, function(x) x = NA), products))
but I get an error. Can you suggest another way of doing this? What is wrong with my solution? thanks in advance
Here is one solution.
# Example data frame that should receive the new columns.
df <- data.frame(
  employee = c("John Doe", "Peter Gynn", "Jolie Hope"),
  salary = c(21000, 23400, 26800),
  startdate = as.Date(c("2010-11-1", "2008-3-25", "2007-3-14"))
)
# Small stand-in for the product list; note the NA entry.
products <- data.frame(value = c(NA, "Alternativ Economy", "COMPLETA"))
#products$value <- ifelse(is.na(products$value), "not_available", as.character(products$value))
# Build an all-NA data frame with one column per product and one row per row
# of df, name its columns after the products, and bind it onto df.
cbind(
  df,
  setNames(
    data.frame(matrix(NA, nrow = nrow(df), ncol = nrow(products))),
    products$value
  )
)
employee salary startdate NA Alternativ Economy COMPLETA
1 John Doe 21000 2010-11-01 NA NA NA
2 Peter Gynn 23400 2008-03-25 NA NA NA
3 Jolie Hope 26800 2007-03-14 NA NA NA
I question the wisdom of having NAs as column names, so I'd uncomment that one line of code in there to replace NAs with some character string instead.

Create unique list of names

I have a list of actors:
# Actor attributes, one element per actor.
name <- c("John Doe", "Peter Gynn", "Jolie Hope")
age <- c(26, 32, 56)
postcode <- c("4011", "5600", "7700")
actors <- data.frame(name = name, age = age, postcode = postcode)
name age postcode
1 John Doe 26 4011
2 Peter Gynn 32 5600
3 Jolie Hope 56 7700
I also have an edge list of relations:
# Edge list of relations: row i links from[i] to to[i].
from <- c("John Doe", "John Doe", "John Doe", "Peter Gynn", "Peter Gynn", "Jolie Hope")
to <- c("John Doe", "John Doe", "Peter Gynn", "Jolie Hope", "Peter Gynn", "Frank Smith")
edge <- data.frame(from = from, to = to)
from to
1 John Doe John Doe
2 John Doe John Doe
3 John Doe Peter Gynn
4 Peter Gynn Jolie Hope
5 Peter Gynn Peter Gynn
6 Jolie Hope Frank Smith
First, I want to eliminate self references in my edge list i.e. rows 1,2,5 in my 'edge' dataframe.
# Intended to drop self-references by keeping rows where `from` differs from `to`.
non.self.ref <- edge[!(edge$from == edge$to),]
does not produce the desired result.
Second, edge includes a name not in the 'actor' dataframe ('Frank Smith'). I want to add 'Frank Smith' to my 'actor' dataframe, even though I do not have age or postcode data for 'Frank Smith'. For example:
name age postcode
1 John Doe 26 4011
2 Peter Gynn 32 5600
3 Jolie Hope 56 7700
4 Frank Smith NA NA
I would be grateful for a tidy solution!
Here is a tidyverse solution to both parts, though in general try not to ask multiple questions per question.
The first part is fairly simple. filter allows a very intuitive syntax that just specifies you want to keep rows where from isn't equal to to.
The second part is a little more complicated. First we gather up the from and to columns, so all the actors are in one column. Then we use distinct to leave us with a one column tbl with unique actor names. Finally, we can use full_join to combine the tables. A full_join keeps all rows and columns from both tables, matching on shared name column by default, and fills NA if there is no data (as there isn't for Frank).
library(tidyverse)
# Actor attributes as a tibble.
actors <- tibble(
  name = c("John Doe", "Peter Gynn", "Jolie Hope"),
  age = c(26, 32, 56),
  postcode = c("4011", "5600", "7700")
)
# Edge list of relations between actors.
edge <- tibble(
  from = c("John Doe", "John Doe", "John Doe", "Peter Gynn", "Peter Gynn", "Jolie Hope"),
  to = c("John Doe", "John Doe", "Peter Gynn", "Jolie Hope", "Peter Gynn", "Frank Smith")
)
# Part 1: drop self-references by keeping rows whose two endpoints differ.
filter(edge, from != to)
#> # A tibble: 3 x 2
#> from to
#> <chr> <chr>
#> 1 John Doe Peter Gynn
#> 2 Peter Gynn Jolie Hope
#> 3 Jolie Hope Frank Smith
# Part 2: stack `from` and `to` into a single `name` column, keep each name
# once, then full-join the attributes so actors without data get NA.
# (gather() is superseded by pivot_longer() in current tidyr.)
edge %>%
  gather(key = "to_from", value = "name", from, to) %>%
  distinct(name) %>%
  full_join(actors)
#> Joining, by = "name"
#> # A tibble: 4 x 3
#> name age postcode
#> <chr> <dbl> <chr>
#> 1 John Doe 26.0 4011
#> 2 Peter Gynn 32.0 5600
#> 3 Jolie Hope 56.0 7700
#> 4 Frank Smith NA <NA>
Created on 2018-03-02 by the reprex package (v0.2.0).
I discovered by including stringsAsFactors = FALSE e.g.
# Keep `from` and `to` as character vectors (not factors) so they can be
# compared with `==`. FALSE is spelled out because F can be reassigned.
edge <- data.frame(from, to, stringsAsFactors = FALSE)
then:
# With character columns, keep only the rows whose two endpoints differ.
non.self.ref <- edge[edge$from != edge$to, ]
works!
An option with dplyr would be to filter the rows by comparing 'from' and 'to' (to get the first output - it is not needed if we are interested only at the second output), unlist, get the unique values, convert it to a tibble and do a left_join
library(dplyr)
edge %>%
  filter(from != to) %>% # rows answering the first question
  unlist() %>% # flatten both columns into one vector of names
  unique() %>%
  tibble(name = .) %>%
  left_join(actors) # second output: attributes attached by name
# A tibble: 4 x 3
# name age postcode
# <chr> <dbl> <fctr>
#1 John Doe 26.0 4011
#2 Peter Gynn 32.0 5600
#3 Jolie Hope 56.0 7700
#4 Frank Smith NA <NA>

Unpacking and merging lists in a column in data.frame

I have the following data.frame:
id name altNames
1001 Joan character(0)
1002 Jane c("Janie", "Janet", "Jan")
1003 John Jon
1004 Bill Will
1005 Tom character(0)
The column altNames could be empty (i.e. character(0)), have just one name, or a list of names. What I want is a data.frame (or a list) where each entry from name and/or altNames appears just once along with the corresponding id, like this:
id name
1001 Joan
1002 Jane
1002 Janie
1002 Janet
1002 Jan
1003 John
1003 Jon
1004 Bill
1004 Will
1005 Tom
What's the most efficient way of doing it? Even better if dplyr is utilized.
Thanks
Edit: Here's the data:
# Example data: `altNames` is a list-column holding zero, one, or several
# alternative names per row. tibble() replaces the deprecated data_frame().
df <- tibble(
  id = c("1001", "1002", "1003", "1004", "1005"),
  name = c("Joan", "Jane", "John", "Bill", "Tom"),
  altNames = list(character(0), c("Janie", "Janet", "Jan"), "Jon", "Will", character(0))
)
Here's a possible data.table approach
library(data.table)
# For each id, stack the primary name on top of its alternative names.
setDT(dat)[, list(name = c(name, unlist(altNames))), by = id]
# id name
# 1: 1001 Joan
# 2: 1002 Jane
# 3: 1002 Janie
# 4: 1002 Janet
# 5: 1002 Jan
# 6: 1003 John
# 7: 1003 Jon
# 8: 1004 Bill
# 9: 1004 Will
# 10: 1005 Tom
A base R version (using the df added by #rawr)
# Base R: build one row per (id, name), listing each primary name followed
# by its alternatives.
with(df, {
  # Map() always returns a list. mapply() with its default simplification
  # collapses to a matrix when every row has the same number of alternatives,
  # which would break the lengths()/rep() bookkeeping below.
  ns <- Map(c, name, altNames)
  data.frame(
    id = rep(id, times = lengths(ns)),
    name = unlist(ns),
    row.names = NULL
  )
})
# id name
#1 1001 Joan
#2 1002 Jane
#3 1002 Janie
#4 1002 Janet
#5 1002 Jan
#6 1003 John
#7 1003 Jon
#8 1004 Bill
#9 1004 Will
#10 1005 Tom
Here's a full dplyr + tidyr solution, the way I'd tackle it:
library(dplyr)
library(tidyr)
# Example data: `altNames` is a list-column holding zero, one, or several
# alternative names per row. tibble() replaces the deprecated data_frame().
df <- tibble(
  id = c("1001", "1002", "1003", "1004", "1005"),
  name = c("Joan", "Jane", "John", "Bill", "Tom"),
  altNames = list(character(0), c("Janie", "Janet", "Jan"), "Jon", "Will", character(0))
)
# Row-wise concatenation helper: combines the i-th elements of all inputs
# with c() and returns one list entry per position, so a plain vector can be
# merged element by element with a list of vectors.
vector_c <- function(...) {
  mapply(FUN = c, ..., SIMPLIFY = FALSE)
}
# Combine each primary name with its alternatives into a list-column, keep
# only `id` and that column, then expand it to one row per name.
df %>%
  mutate(names = vector_c(name, altNames)) %>%
  select(id, names) %>%
  unnest(names)
#> Source: local data frame [10 x 2]
#>
#> id names
#> 1 1001 Joan
#> 2 1002 Jane
#> 3 1002 Janie
#> 4 1002 Janet
#> 5 1002 Jan
#> 6 1003 John
#> 7 1003 Jon
#> 8 1004 Bill
#> 9 1004 Will
#> 10 1005 Tom
Most of the hard work is done by tidyr::unnest(): it's designed to take data frame with a list-column and unnest it, repeating the other columns as needed.
Using tidyr, after cleaning the data with data.table:
First, fix the data:
library(data.table)
# Convert to a data.table, then replace every empty alternative-name vector
# with NA (presumably so those rows survive the unnest step that follows).
dat <- setDT(dat)
dat$altNames[lengths(dat$altNames) == 0] <- NA
Now unnest from tidyr and some dplyr:
library(dplyr)
library(tidyr)
# Unnest the alternative names, then for each id collect the distinct values
# of the primary name together with its alternatives.
# NOTE(review): do() in current dplyr expects a data frame result — confirm
# this still runs on your version.
dat %>% unnest(altNames) %>%
group_by(id) %>%
do(unique(c(.[["name"]],.[["altNames"]])))
id V1
1 1001 Joan
2 1001 NA
3 1002 Jane
4 1002 Janie
5 1002 Janet
6 1002 Jan
7 1003 John
8 1003 Jon
9 1004 Bill
10 1004 Will
11 1005 Tom
12 1005 NA
it has the NAs, but they are easily removed with %>% na.omit.
I think data.table is the winner on this one.

Resources