Joining 2 data frames - r

I have 2 data frames and want to put a match column on one of them
library(plyr)
d1<-data.frame(date=c("2015-01-01","2015-02-05"),s= c("b","s"),name=c("bob","frank"),number=c(10,10.44), MatchorNoMatch= as.character(c("","")))
d1
d2<-data.frame(date2=c("2015-01-01","2015-02-06"),s2= c("b","b"),name2=c("bob","george"),number2=c(10,114))
d2
d1[d1$date %in% d2$date2 & d1$s %in% d2$s2 & d1$name %in% d2$name2 & d1$number %in% d2$number2,"MatchorNoMatch"] <- "match"
d1
here is what I get when I run that:
> library(plyr)
> d1<-data.frame(date=c("2015-01-01","2015-02-05"),s= c("b","s"),name=c("bob","frank"),number=c(10,10.44), MatchorNoMatch= as.character(c("","")))
> d1
date s name number MatchorNoMatch
1 2015-01-01 b bob 10.00
2 2015-02-05 s frank 10.44
> d2<-data.frame(date2=c("2015-01-01","2015-02-06"),s2= c("b","b"),name2=c("bob","george"),number2=c(10,114))
> d2
date2 s2 name2 number2
1 2015-01-01 b bob 10
2 2015-02-06 b george 114
>
> d1[d1$date %in% d2$date2 & d1$s %in% d2$s2 & d1$name %in% d2$name2 & d1$number %in% d2$number2,"MatchorNoMatch"] <- "match"
Warning message:
In `[<-.factor`(`*tmp*`, iseq, value = "match") :
invalid factor level, NA generated
> d1
date s name number MatchorNoMatch
1 2015-01-01 b bob 10.00 <NA>
2 2015-02-05 s frank 10.44
I am getting a NA in the MatchOrNoMatch column. Any idea?
===========ACTAUALLY I JUST NEEDE TO PUT stringASFactors = FALSE
here is why using %in% won't work. Bob shoudl not be a match
library(plyr)
d1<-data.frame(date=c("2015-01-01","2015-02-05","2015-01-01"),s= c("b","s","s"),name=c("bob","frank","g"),number=c(10,10.44,66), match= as.character(c("","","")),stringsAsFactors= FALSE)
d1
class(d1$match)
d2<-data.frame(date2=c("2015-01-15","2015-02-05","2015-01-01"),s2= c("b","s","s"),name2=c("bob","frank","g"),number2=c(10,10.44,55),stringsAsFactors= FALSE)
d2
d1[d1$date %in% d2$date2 & d1$s %in% d2$s2 & d1$name %in% d2$name2 & d1$number %in% d2$number2,"match"] <- d2[d1$date %in% d2$date2 & d1$s %in% d2$s2 & d1$name %in% d2$name2 & d1$number %in% d2$number2, "name2"]
d1

This is really easy to do with just the base merge command from R.
d2$name2<-d2$name
merge(d1,d2,all.x=TRUE)
date s name number name2
1 2015-01-01 b bob 10.00 bob
2 2015-02-05 s frank 10.44 <NA>
merge(d1,d2,by=c("date","s","name","number"),all.x=TRUE)
edited in your specific column names that you wanted to match by.

Related

Modify multiple columns at same time in R

I don't know how to say it clearly, that is maybe why i did not find the answer, but i want to edit the values of two different columns at the same time, while they are the identifying columns.
For example this is the data :
> data = data.frame(name1 = c("John","Jake","John","Paul"),
name2 = c("Paul", "Paul","John","John"),
value1 = c(0,0,1,0),
value2 = c(1,0,1,0))
> data
name1 name2 value1 value2
1 John Paul 0 1
2 Jake Paul 0 0
3 John John 1 1
4 Paul John 0 0
I would like to edit the values of the first row so the first row become Jake & John instead of John & Paul, and so i would like to combine these two lines of code for doing it at the same time :
data$name1[(data$name1 == "John" & data$name2 == "Paul")] <- "Jake"
data$name2[(data$name1 == "John" & data$name2 == "Paul")] <- "John"
Should be a simple trick but i dont have it !
Also, i should do that on larger datasets each modification can appear on multiple lines, and i cant know on which rows will be the modification
How about this ?
data[data$col1 == "A" & data$col2 == "B", ] <- list("B", "D")
data
# col1 col2
#1 B D
#2 A C
#3 B A
#4 B B
library(tidyverse)
data %>%
mutate(
name1=
case_when(
name1=="John" & name2=="Paul" ~ "Jake",
TRUE ~ name1
),
name2=
case_when(
name1=="John" & name2=="Paul" ~ "John",
TRUE ~ name2))

How to rewrite loop to run faster in R?

Given a dataset of > 900,000 rows, of which length(duplicates) = >300,000, the following loop takes appr 4h to run in R, which is unacceptable.
for(i in duplicates) {
couple_table <- filter(data, pnr == i) # filter patients
min_date <- min(couple_table$date) # determine date of first operation
max_date <- max(couple_table$date) # determine date of second operation
data$first[data$pnr == i & data$date == min_date] <- 1 # assign 1 to column first
data$second[data$pnr == i & data$date == max_date] <- 1 # assign 1 to column second
}
How can I tweak this code to run faster in R? I have had a look at *apply but I am not familiar with it at all, any ideas?
Dummy data:
data <- data.frame(pnr = c('a43','a4945', 'a43', 'a231', 'a231', 'a6901'),
date = c(as.Date('2011-12-19'), as.Date('2012-09-11'), as.Date('2013-10-01'),
as.Date('2012-05-09'), as.Date('2009-09-10'), as.Date('2015-06-12')))
duplicates <- as.character(data$pnr[duplicated(data$pnr)])
A group by operation would be more faster
library(dplyr)
data %>%
group_by(pnr) %>%
mutate(Min = if(n() > 1) NA^(date != min(date)) else NA,
Max = if(n() > 1) NA^(date != max(date)) else NA) %>%
ungroup
-output
# A tibble: 6 x 4
# pnr date Min Max
# <chr> <date> <dbl> <dbl>
#1 a43 2011-12-19 1 NA
#2 a4945 2012-09-11 NA NA
#3 a43 2013-10-01 NA 1
#4 a231 2012-05-09 NA 1
#5 a231 2009-09-10 1 NA
#6 a6901 2015-06-12 NA NA
Similar logic in data.table would be
library(data.table)
setDT(data)[, c('Min', 'Max') := .(if(.N > 1)
NA^(date != min(date)) else NA, if(.N> 1)
NA^(date != max(date)) else NA), .(pnr)]
Or may use collapse for faster execution
library(collapse)
data %>%
ftransform(n = fNobs(date, pnr, TRA = 'replace_fill')) %>%
ftransform(Min = NA^(fmin(date, pnr, TRA = "replace_fill") != date | n == 1),
Max = NA^(fmax(date, pnr, TRA = "replace_fill") != date | n == 1), n = NULL )
# pnr date Min Max
#1 a43 2011-12-19 1 NA
#2 a4945 2012-09-11 NA NA
#3 a43 2013-10-01 NA 1
#4 a231 2012-05-09 NA 1
#5 a231 2009-09-10 1 NA
#6 a6901 2015-06-12 NA NA
Or use base R with duplicated
i1 <- with(data, duplicated(pnr)|duplicated(pnr, fromLast = TRUE))
data$Min <- with(data, i1 & date == ave(date, pnr, FUN = min))
data$Max <- with(data, i1 & date == ave(date, pnr, FUN = max))
With data.table
library(data.table)
setDT(data)
data[pnr %in% duplicates, ":="(
Min = (date == min(date)) * 1L,
Max = (date == max(date)) * 1L
), by = pnr
]
data[, c("Min", "Max") := lapply(.SD, function(x) ifelse(x == 0, NA, x)), .SDcols = c("Min", "Max")]
Here is a base R solution with ave. It uses the trick in akrun's answer, that
NA^0 == 1
(More precisely, that NA^FALSE == NA^0 == 1)
data$first <- with(data, ave(as.integer(date), pnr, FUN = function(d) NA^(d == max(d))))
data$second <- with(data, ave(as.integer(date), pnr, FUN = function(d) NA^(d == min(d))))
data
# pnr date first second
#1 a43 2011-12-19 1 NA
#2 a4945 2012-09-11 NA NA
#3 a43 2013-10-01 NA 1
#4 a231 2012-05-09 NA 1
#5 a231 2009-09-10 1 NA
#6 a6901 2015-06-12 NA NA
A data.table option
setDT(data)[
,
`:=`(
first = ifelse(min(date) == date & .N > 1, 1, NA_integer_),
second = ifelse(max(date) == date & .N > 1, 1, NA_integer_)
),
pnr
]
gives
pnr date first second
1: a43 2011-12-19 1 NA
2: a4945 2012-09-11 NA NA
3: a43 2013-10-01 NA 1
4: a231 2012-05-09 NA 1
5: a231 2009-09-10 1 NA
6: a6901 2015-06-12 NA NA

Check if column value is in between (range) of two other column values

I have a data frame that looks like this (Dataframe X):
id number found
1 5225 NA
2 2222 NA
3 3121 NA
I have another data frame that looks like this (Dataframe Y):
id number1 number2
1 4000 6000
3 2500 3300
3 7000 8000
What I want to do is this: For each value in the Dataframe X "number" column, search if it is equal to or between ANY of the "number1" and "number2" pair values of Dataframe Y. Additionally, for this "number1" and "number2" pair values, its respective "id" must match the "id" in Dataframe X. If this is all true, then I want to insert a "YES in the "found" column of the respective row in Dataframe X:
id number found
1 5225 YES
2 2222 NA
3 3121 YES
How would I go about doing this? Thanks for the help.
Using tidyverse functions, especially map_chr to iterate over each number:
library(tidyverse)
tbl1 <- read_table2(
"id number found
1 5225 NA
2 2222 NA
3 3121 NA"
)
tbl2 <- read_table2(
"id number1 number2
1 4000 6000
2 2500 3300
3 7000 8000"
)
tbl1 %>%
mutate(found = map_chr(
.x = number,
.f = ~ if_else(
condition = any(.x > tbl2$number1 & .x < tbl2$number2),
true = "YES",
false = NA_character_
)
))
#> # A tibble: 3 x 3
#> id number found
#> <int> <int> <chr>
#> 1 1 5225 YES
#> 2 2 2222 <NA>
#> 3 3 3121 YES
Created on 2018-10-18 by the reprex package (v0.2.0).
Here is an option using fuzzy_join
library(fuzzy_join)
library(dplyr)
fuzzy_left_join(X, Y[-1], by = c("number" = "number1", "number" = "number2"),
match_fun =list(`>=`, `<=`)) %>%
mutate(found = c(NA, "YES")[(!is.na(number1)) + 1]) %>%
select(names(X))
# id number found
#1 1 5225 YES
#2 2 2222 <NA>
#3 3 3121 YES
Or another option is a non-equi join with data.table
library(data.table)
setDT(X)[, found := NULL]
X[Y, found := "YES", on = .(number >= number1, number <= number2)]
X
# id number found
#1: 1 5225 YES
#2: 2 2222 <NA>
#3: 3 3121 YES
data
X <- structure(list(id = 1:3, number = c(5225L, 2222L, 3121L), found = c(NA,
NA, NA)), class = "data.frame", row.names = c(NA, -3L))
Y <- structure(list(id = 1:3, number1 = c(4000L, 2500L, 7000L), number2 = c(6000L,
3300L, 8000L)), class = "data.frame", row.names = c(NA, -3L))
We can loop over each x$number using sapply and check if it lies in range of any of y$number1 and y$number2 and give the value accordingly.
x$found <- ifelse(sapply(x$number, function(p)
any(y$number1 <= p & y$number2 >= p)),"YES", NA)
x
# id number found
#1 1 5225 YES
#2 2 2222 <NA>
#3 3 3121 YES
Using the same logic but with replace
x$found <- replace(x$found,
sapply(x$number, function(p) any(y$number1 <= p & y$number2 >= p)), "YES")
EDIT
If we want to also compare the id value we could do
x$found <- ifelse(sapply(seq_along(x$number), function(i) {
inds <- y$number1 <= x$number[i] & y$number2 >= x$number[i]
any(inds) & (x$id[i] == y$id[which.max(inds)])
}), "YES", NA)
x$found
#[1] "YES" NA "YES"
Using sqldf:
library(sqldf)
sql <- "SELECT DISTINCT x.id, x.number, "
sql <- paste0(sql, "CASE WHEN y.id IS NOT NULL THEN 'YES' END AS found ")
sql <- paste0(sql, "FROM X x LEFT JOIN Y y ON x.number BETWEEN y.number1 AND y.number2")
X <- sqldf(sql)

Filter rows of a dataframe with an equal column value

Let's suppose we have a dataframe like this:
df <- data.frame(v1=c("aa", "aa", "b", "cc", "cc"), V2=c("yes", "yes", "no", "yes", "no"))
> df
six seven
1 aa yes
2 aa yes
3 b no
4 cc yes
5 cc no
I want to filter and, then, store in a new dataframe rows that matches 2 cryteria: same "six" column value and a specific "seven" column value. For example, let's suppose we want rows with "yes" column:
> df
six seven
1 aa yes
2 aa yes
How can I do this? I've tried with:
df_new <- filter(df, ...)
But I'm sure sure how to impose both the conditions.
and:
require(plyr)
ans = ddply(df, .(seven == "yes"), mutate, count = length(unique(six)))
Who gives:
> ans
seven == "yes" six seven count
1 FALSE b no 2
2 FALSE cc no 2
3 FALSE cc no 2
4 TRUE aa yes 1
5 TRUE aa yes 1
But this doesn't filter the dataframe.
EDIT: To clarify, if I have more columns in the dataframe, like this:
df <- data.frame(v1=c("aa", "aa", "b", "cc", "cc","aa","aa"), v2=c("yes", "yes", "no", "yes", "no","no","yes"))
> df
v1 v2
1 aa yes
2 aa yes
3 b no
4 cc yes
5 cc no
6 aa no
7 aa yes
The code has to give this:
df
six seven
1 aa yes
2 aa yes
7 aa yes
Ok, finally I had it. I leave here the solution for those who want to know:
types <- unique(df$six)
tmp = list()
require(dplyr)
for (k in 1:length(types)) {
tmp[[k]] <- df %>% filter(six == types[k] & seven == "yes")
}
ls <- Filter(function(x) nrow(x) > 1, tmp)
A bit tricky, maybe, but works. Of course, you have to extract a dataframe from the list in the end. If someone have a better idea, post it. If you're wondering why I'm using list, working with only dataframes gave me some problems.
Here is an idea via ddplyr. First group by v1 and add the 2 criteria in filter. The group needs to be larger than 2 so to infer that v1 values are the same, and of course v2 == 'yes' is self explanatory,
library(tidyverse)
df %>%
group_by(v1) %>%
filter(n() >= 2 & all(v2 == 'yes'))
which gives,
# A tibble: 2 x 2
# Groups: v1 [1]
v1 v2
<fct> <fct>
1 aa yes
2 aa yes

aggregate command in R

How can I use aggregate command for converting this table:
name ID
a 1
a 2
a 2
a NA
b NA
c NA
c NA
to this one:
name ID
a 1|2
b NA
c NA
Thanks.
In base:
> aggregate(ID ~ name, data=x, FUN=function(y) paste(unique(y),
collapse='|'),na.action=na.pass)
name ID
1 a 1|2|NA
2 b NA
3 c NA
This differs from your specification in the handling of the fourth row.
You can try:
library(tidyr);
df$name <- as.factor(df$name)
aggregate(ID ~ name, unique(df[complete.cases(df),]), paste, collapse = "|") %>%
complete(name)
Source: local data frame [3 x 2]
name ID
(fctr) (chr)
1 a 1|2
2 b NA
3 c NA
The logic here is filtering out all incomplete rows and duplicated rows firstly, paste the ID together and then use the complete function from tidyr package to automatically fill the factor variable with all the levels to make sure no information is missing.
We can use data.table. Convert the 'data.frame' to 'data.table' (setDT(df1)), grouped by 'name', if the elements in 'ID' are all NA, then we return the NA or else paste the unique elements that are not NA in the 'ID' column.
library(data.table)
setDT(df1)[,.(ID= if(all(is.na(ID))) NA_character_ else
paste(na.omit(unique(ID)), collapse = "|")), by = name]
# name ID
#1: a 1|2
#2: b NA
#3: c NA
The same methodology can be used in dplyr
library(dplyr)
df1 %>%
group_by(name) %>%
summarise(ID = if(all(is.na(ID))) NA_character_
else paste(unique(ID[!is.na(ID)]), collapse="|"))
# name ID
# <chr> <chr>
#1 a 1|2
#2 b <NA>
#3 c <NA>

Resources