Matching columns in 2 data frames when numbers don't exactly match

How do I match two different data frames when the values I am comparing are not exactly the same?
I was thinking of using merge() but I am not sure.
Table1:
ID          Value.1
10001       x
18273-9     y
12824/5/6/7 z
10283/5/9   d

Table2:
ID          Value.2
10001       a
18274       b
12826       c
10289       u
How do I merge Table 1 and 2 based on ID?
Which specific function of the fuzzyjoin package would I use, especially for the "/" and "-" cases? How do I expand the "-" case from 18273-9 so that R will register 18273 / 18274 / 18275 / ...?

You can write a function to extract the corresponding sequences from the strings containing "/" or "-" and recombine them into a new data.frame as follows:
df1 <- data.frame(ID = c("10001", "18273-9", "15273-8", "15170-4", "12824/5/6/7", "10283/5/9"),
                  value = c("a", "c", "c", "d", "k", "l"), stringsAsFactors = FALSE)
df2 <- data.frame(ID = c("10001", "18274", "12826", "10289"),
                  value = c("o", "p", "q", "r"), stringsAsFactors = FALSE)
doIt <- function(df){
  # helper: turn a named list of expanded IDs back into a two-column data.frame
  listAsDF <- function(l) {
    x <- stack(setNames(l, temp$value))
    names(x) <- c("ID", "value")
    return(x)
  }
  # rows whose ID contains neither "/" nor "-" stay as they are
  Base <- df[!grepl("/", df$ID) & !grepl("-", df$ID), ]
  #1 cases where "-" is present, e.g. "18273-9" -> 18273, 18274, ..., 18279
  temp <- df[grep("-", df$ID), ]
  temp <- listAsDF(lapply(strsplit(temp$ID, "-"), function(e)
    as.character(seq(as.numeric(e[1]),
                     as.numeric(paste0(strtrim(e[1], nchar(e[1]) - 1), e[2])), 1))))
  Base <- rbind(Base, temp)
  #2 cases where "/" is present, e.g. "12824/5/6/7" -> 12824, 12825, 12826, 12827
  temp <- df[grep("/", df$ID), ]
  temp <- listAsDF(lapply(strsplit(temp$ID, "/"), function(a)
    c(a[1], paste0(strtrim(a[1], nchar(a[1]) - 1), a[-1]))))
  Base <- rbind(Base, temp)
  return(Base)
}
Then you can merge df1 and df2:
merge(doIt(df1), df2, by = "ID", all.x = T)
Hope this helps!

You could use the fuzzy string matching function "agrep" from base R.
df1 <- data.frame(ID = c("10001", "18273-9", "12824/5/6/7", "10283/5/9"),
                  value = c("a", "c", "d", "k"))
df2 <- data.frame(ID = c("10001", "18274", "12826", "10289"),
                  value = c("o", "p", "q", "r"))
apply(df1, 1, function(x) agrep(x["ID"], df2$ID, max = 3.5))
As you can see, it struggles to find the match for row 4. So it might make sense to clean your ID variable (e.g., take out the "/") before running agrep.
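For example, one way to do that cleaning is to strip everything from the first "-" or "/" onward before matching (a small sketch of the suggestion above, not from the original answer; the max.distance value may need tuning for real IDs):
df1$ID_clean <- sub("[-/].*$", "", as.character(df1$ID))   # "12824/5/6/7" -> "12824"
sapply(df1$ID_clean, function(p) agrep(p, as.character(df2$ID), max.distance = 1))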

One option is to extract only the part of the ID you want to keep, and then do your merge.
You can format your ID column as follows:
library(stringr)
library(dplyr)
If you want only the digits before any symbols:
Table1 %>% mutate(ID = str_extract(ID, "[0-9]*"))
If you want to keep the first sequence of 5 digits:
Table1 %>% mutate(ID = str_extract(ID, "[0-9]{5}"))
This answers your second question, but does not use the fuzzyjoin package
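To address the fuzzyjoin part of the question directly: the stock string-distance joins do not handle the "-"/"/" shorthand reliably, but fuzzyjoin::fuzzy_left_join() accepts a custom match_fun, so you can test whether each Table2 ID falls inside the set expanded from a Table1 pattern. The sketch below illustrates that idea using the df1/df2 from the first answer; expand_id() is a helper written here for illustration, not part of fuzzyjoin:
library(fuzzyjoin)

# hypothetical helper: expand "18273-9" or "12824/5/6/7" into all the IDs they denote
expand_id <- function(id) {
  if (grepl("-", id)) {
    parts <- strsplit(id, "-")[[1]]
    to <- paste0(strtrim(parts[1], nchar(parts[1]) - nchar(parts[2])), parts[2])
    as.character(seq(as.numeric(parts[1]), as.numeric(to)))
  } else if (grepl("/", id)) {
    parts <- strsplit(id, "/")[[1]]
    c(parts[1], paste0(strtrim(parts[1], nchar(parts[1]) - 1), parts[-1]))
  } else {
    id
  }
}

# match_fun receives vectors of candidate ID pairs and returns TRUE/FALSE per pair
fuzzy_left_join(df1, df2, by = c("ID" = "ID"),
                match_fun = function(x, y) mapply(function(a, b) b %in% expand_id(a), x, y))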


How do I get the column number from a dataframe which contains specific strings?

I have a data frame df with 7 columns and I have a list z containing multiple strings.
I want a dataframe containing only the columns in df which contain the strings from z.
df <- data.frame("a_means","b_means","c_means","d_means","e_mean","f_means","g_means")
z <- c("a_m","c_m","f_m")
How do I get the column numbers of the z strings in df? Or how do I get a dataframe with only the columns that contain the z strings?
What I want is:
print(df)
"a_means" "c_m" "f_m"
What I tried:
match(a, names(df)
and
df[,which(colnames(df) %in% colnames(df[ ,grepl(z,names(df)])]
You can use:
df[,match(z, substring(colnames(df), 1, 3))]
With base R:
z <- paste(z, collapse = "|")
df[, grepl(z, names(df))] # you could use grep as well
Combine the search patterns and use the result as the pattern for the stringr::str_detect() function.
library(dplyr)
library(stringr)
df <- data.frame(a_means = "a_means",
                 b_means = "b_means",
                 c_means = "c_means",
                 d_means = "d_means",
                 e_means = "e_means",
                 f_means = "f_means",
                 g_means = "g_means")
z <- c("a_m","c_m","f_m")
z <- paste(z, collapse = "|")
df %>% select_if(str_detect(names(df), z))
#> a_means c_means f_means
#> 1 a_means c_means f_means
You can simply do this:
library(dplyr)
df %>%
  select(contains(z))
Check out help("starts_with"); among other things, you can also match on a starting prefix with starts_with().
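For example, since the entries of z are prefixes, something like this should also work (a sketch; it assumes properly named columns such as a_means, b_means, ... and a tidyselect version that accepts a vector of prefixes):
df %>% select(starts_with(z))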
You can use select and matches to subset the columns based on z:
library(dplyr)
df <- data.frame("a_means","b_means","c_means","d_means","e_mean","f_means","g_means")
z <- c("a_m","c_m","f_m")
df %>%
  select(matches(z))
#> X.a_means. X.c_means. X.f_means.
#> 1 a_means c_means f_means

Flipping two sides of string

I need to prepare a certain dataset for analysis. What I have is a table with column names (obviously). The column names are as follows (sample colnames):
"X99_NORM", "X101_NORM", "X76_110_T02_09747", "X30_NORM"
(this is a vector, for those not familiar with R's colnames() function)
Now, what I want is simply to flip the values in front of, and after the underscore. e.g. X99_NORM becomes NORM_X99. Note that I want this only for the column names which contain NORM in their name.
Some other base R options
1)
Use sub to switch the beginning and end - we can make use of capturing groups here.
x <- sub(pattern = "(^X\\d+)_(NORM$)", replacement = "\\2_\\1", x = x)
Result
x
# [1] "NORM_X99" "NORM_X101" "X76_110_T02_09747" "NORM_X30"
2)
A regex-free approach that might be more efficient, using chartr, dirname, and paste0. We first need the indices of the names that contain "NORM":
idx <- grep(x = x, pattern = "NORM", fixed = TRUE)
x[idx] <- paste0("NORM_", dirname(chartr("_", "/", x[idx])))
x
data
x <- c("X99_NORM", "X101_NORM", "X76_110_T02_09747", "X30_NORM")
x = c("X99_NORM", "X101_NORM", "X76_110_T02_09747", "X30_NORM")
replace(x,
        grepl("NORM", x),
        sapply(strsplit(x[grepl("NORM", x)], "_"), function(x){
          paste(rev(x), collapse = "_")
        }))
#[1] "NORM_X99" "NORM_X101" "X76_110_T02_09747" "NORM_X30"
A tidyverse solution with stringr:
library(tidyverse)
library(stringr)
my_data <- tibble(column = c("X99_NORM", "X101_NORM", "X76_110_T02_09747", "X30_NORM"))
my_data %>%
  filter(str_detect(column, "NORM")) %>%
  mutate(column_2 = paste0("NORM", "_", str_extract(column, ".+(?=_)"))) %>%
  select(column_2)
# A tibble: 3 x 1
column_2
<chr>
1 NORM_X99
2 NORM_X101
3 NORM_X30
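If you would rather keep all the column names and only flip the ones containing NORM, a mutate() with str_replace() and the same capture groups as the sub() answer above should also work (a sketch):
my_data %>%
  mutate(column = str_replace(column, "^(X\\d+)_(NORM)$", "\\2_\\1"))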

R - Append rows from dataframe to another one without duplicate on "primary keys columns"

I have two dataframes (A and B). B contains new values and A contains outdated values.
Each of these dataframes has one column representing the key and another one representing the value.
I want to add rows from B to A and then remove rows with duplicated keys from A (i.e. update A with the new values that are in B). Order doesn't really matter; I think it is easier in the other order: cleaning duplicates and then appending.
At the moment, I have done this script :
A <- bind_rows(B, A)
A <- A[!duplicated(A),]
The issue I have is that it doesn't clean rows because they are not real duplicates (value is different).
How could I handle this?
This is just a hunch because there's no example data provided, but I suspect a merge is a much safer approach than a row-bind:
Solution with data.table
library(data.table)
1 - Rename variables to prepare for a merge
setnames(A, old="value", new="value_A")
setnames(B, old="value", new="value_B")
2 - Merge, be sure to use the all arg
dt <- merge(A, B, by="key", all=TRUE)
3 - Use some rule for the update - for example: use value_B unless it's missing, in which case use value_A
dt[ , value := value_B]
dt[is.na(value), value := value_A]
Solution with Base R
names(A) <- c("key", "value_A")
names(B) <- c("key", "value_B")
df <- merge(A, B, by="key", all=TRUE)
df$value <- df$value_B
df[is.na(df$value), "value"] <- df[is.na(df$value), "value_A"]
Solution with dplyr/tidyverse
library(dplyr)
df <- full_join(A, B, by = "key") %>%
  mutate(value = ifelse(is.na(value_B), value_A, value_B))
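If you prefer the bind-rows idea from the question, dropping the outdated keys from A before appending also works (a sketch with dplyr::anti_join, assuming A and B still have their original key/value columns and B holds the up-to-date values):
A_updated <- bind_rows(B, anti_join(A, B, by = "key"))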
Example Data
set.seed(1234)
A <- data.frame(
  key = sample(1:50, size = 20),
  value = runif(20, 1, 10))
B <- data.frame(
  key = sample(1:50, size = 20),
  value = runif(20, 1, 10))

In R, how can I copy rows from one dataframe to another when the df being copied to has 2 additional columns?

I have a tab delimited text file with 12 columns that I am uploading to my program. I go on to create another dataframe with a structure similar to the one uploaded and add 2 more columns to it.
excelfile = read.delim(ExcelPath)
matchedPictures<- excelfile[0,]
matchedPictures$beforeName <- character()
matchedPictures$afterName <- character()
Now I have a function in which I do the following:
Based on a condition, I obtain the row number pictureMatchNum of the row I need to copy from excelfile to matchedPictures.
I should then copy the row from excelfile to matchedPictures. I tried a couple of different ways so far.
a.
rowNumber = nrow(matchedPictures) + 1
matchedPictures[rowNumber,1:12] <<- excelfile[pictureMatchNum,1:12]
b.
matchedPictures[rowNumber,1:12] <<- rbind(matchedPictures, excelfile[pictureWordMatches,1:12], make.row.names = FALSE)
2a. doesn't seem to work because it copies the indices from the excelfile and uses them as row names in matchedPictures - which is why I decided to go with rbind.
2b. doesn't seem to work because rbind needs the columns to be identical, and matchedPictures has 2 extra columns.
EDIT START - Including reproducible example.
Here is some reproducible code (with fewer columns and fake data)
library(stringr) # for str_detect() and the words / fruit example vectors
excelfile <- data.frame(x = letters, y = words[length(letters)], z = fruit[length(letters)])
matchedPictures <- excelfile[0,]
matchedPictures$beforeName <- character()
matchedPictures$afterName <- character()
pictureMatchNum1 = match(1, str_detect("A", regex(excelfile$x, ignore_case = TRUE)))
rowNumber1 = nrow(matchedPictures) + 1
pictureMatchNum2 = match(1, str_detect("D", regex(excelfile$x, ignore_case = TRUE)))
rowNumber2 = nrow(matchedPictures) + 1
The 2 options I tried are
2a.
matchedPictures[rowNumber1,1:3] <<- excelfile[pictureMatchNum1,1:3]
matchedPictures[rowNumber1,"beforeName"] <<- "xxx"
matchedPictures[rowNumber1,"afterName"] <<- "yyy"
matchedPictures[rowNumber2,1:3] <<- excelfile[pictureMatchNum2,1:3]
matchedPictures[rowNumber2,"beforeName"] <<- "uuu"
matchedPictures[rowNumber2,"afterName"] <<- "www"
OR
2b.
matchedPictures[rowNumber1,1:3] <<- rbind(matchedPictures, excelfile[pictureMatchNum1,1:3], make.row.names = FALSE)
matchedPictures[rowNumber1,"beforeName"] <<- "xxx"
matchedPictures[rowNumber1,"afterName"] <<- "yyy"
matchedPictures[rowNumber2,1:3] <<- rbind(matchedPictures, excelfile[pictureMatchNum2,1:3], make.row.names = FALSE)
matchedPictures[rowNumber2,"beforeName"] <<- "uuu"
matchedPictures[rowNumber2,"afterName"] <<- "www"
EDIT END
Additionally, I have seen the suggestion in many places that, rather than using empty dataframes, one should build up vectors and then combine them into a dataframe at the end. Is this suggestion valid when I have so many columns and would need 14 separate vectors, copying each one of them individually?
What can I do to make this work?
You could first determine the row indices of excelfile that match your criteria, extract these rows, then generate the data to fill your columns beforeName and afterName, and finally append these columns to your new data frame.
Example:
excelfile <- data.frame(x = letters, y = words[length(letters)],
z = fruit[length(letters)])
## Vector of patterns:
patternVec <- c("A", "D", "M")
## Look for appropriate rows in file 'excelfile':
indexVec <- vapply(patternVec,
                   function(myPattern) which(str_detect(myPattern,
                     regex(excelfile$x, ignore_case = TRUE))),
                   integer(1))
## Extract these rows:
matchedPictures <- excelfile[indexVec,]
## Somehow generate the data for columns 'beforeName' and 'afterName':
## I do not know how this information is generated so I just insert
## some dummy code here:
beforeNameVec <- c("xxx", "uuu", "mmm")
afterNameVec <- c("yyy", "www", "nnn")
## Then assign these variables:
matchedPictures$beforeName <- beforeNameVec
matchedPictures$afterName <- afterNameVec
matchedPictures
# x y z beforeName afterName
# a air dragonfruit xxx yyy
# d air dragonfruit uuu www
# m air dragonfruit mmm nnn
You can make this much simpler by using dplyr
library(dplyr)
library(stringr)
excelfile <- data.frame(x = letters, y = words[length(letters)], z = fruit[length(letters)],
                        stringsAsFactors = FALSE) # add stringsAsFactors to get character columns
pictureMatch <- excelfile %>%
  # create a match column
  mutate(match = ifelse(str_detect(x, "a") | str_detect(x, "d"), 1, 0)) %>%
  # keep only the rows that match your condition
  filter(match == 1)
pictureMatch <- pictureMatch[['x']] # convert to a vector
matchedPictures <- excelfile %>%
  filter(x %in% pictureMatch) %>% # grab the rows that match your condition
  mutate(beforeName = c('xxx', 'uuu'), # add your names
         afterName = c('yyy', 'www'))
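Both answers above avoid growing the data frame row by row. If you do want the rbind() route from attempt 2b, it works once the copied slice is given the two extra columns first, so that the column sets match (a small sketch using the placeholder values from the question):
newRow <- excelfile[pictureMatchNum1, ]
newRow$beforeName <- "xxx"   # placeholder values, as in the question
newRow$afterName  <- "yyy"
matchedPictures <- rbind(matchedPictures, newRow, make.row.names = FALSE)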

Big tasks in R: how to avoid for loops to run faster

My code runs, but very slowly. This is a big problem and it has to run quicker. Here is the task:
I have a dataset with telecommunication records, and I want to apply multiple functions to each customer's records and put the results in another data frame.
df1 is the data frame where each row has a unique customer ID plus columns with some profile information. df2 is a very big data frame with about 800,000 telecommunication records identified by customer ID. Now I want to compute, e.g., the average data usage for each customer in df2 and save the result in df1.
df1 looks like
df1 <- read.table(header = TRUE, sep=",",
text="CUSTOMER_ID,Age,ContractType, Gender
ID1,45,Postpaid,m
ID2,50,Postpaid,f
ID3,35,Postpaid,f
ID4,44,Postpaid,m
ID5,32,Postpaid,m
ID6,48,Postpaid,f
ID7,50,Postpaid,m
ID8,51,Postpaid,f")
df2 looks like
df2 <- read.table(header = TRUE, sep=",",
text="CUSTOMER_ID,EVENT,VOLUME, DURATION, MONTH
ID1,100,500,200,201505
ID1,50,400,150,201506
ID1,80,600,50,201507
ID2,40,800,45,201505
ID2,25,650,120,201506
ID2,65,380,250,201507
ID3,30,950,110,201505
ID3,25,630,85,201506
ID3,15,780,60,201507")
My code looks like this:
USAGE <- c("EVENT", "VOLUME", "DURATION") # column names of df2
A list of functions I want to apply to df2:
StatFunctions <- list(
  max = function(x) max(x),
  mean = function(x) mean(x),
  sum = function(x) sum(x)
)
In my original data set the customer IDs are more complex, so I chose this pattern search for the customer IDs. This is only an excerpt of my code, but the rest has the same problem with the for loops.
func.num <- function(prefix, target.df, n) {
  active.df <- get(target.df)
  return(StatFunctions[[n]](active.df[grep(pattern = prefix,
                                           x = active.df$CUSTOMER_ID), USAGE[m]]))
}
for (x in df1$CUSTOMER_ID) {
  for (m in 1:length(USAGE)) {
    for (n in 1:length(StatFunctions)) {
      df1[df1$CUSTOMER_ID == x, paste(names(StatFunctions[n]),
                                      USAGE[m], sep = "_")] <- func.num(prefix = x, target.df = "df2", n)
    }
  }
}
I know the code is very complicated and should be simplified.
And I want a data frame like this:
Customer_ID Age contractType Gender max_EVENT mean_EVENT sum_EVENT ... sum_DURATION
ID1 45 Postpaid m 100 76 230 ... 400
So how can I avoid the for loops and make this run faster?
I would use the dplyr package to summarize df2 by customer ID, then merge the result with df1.
df1 <- read.table(header = TRUE, sep=",",
text="CUSTOMER_ID,Age,ContractType, Gender
ID1,45,Postpaid,m
ID2,50,Postpaid,f
ID3,35,Postpaid,f
ID4,44,Postpaid,m
ID5,32,Postpaid,m
ID6,48,Postpaid,f
ID7,50,Postpaid,m
ID8,51,Postpaid,f")
df2 <- read.table(header = TRUE, sep=",",
text="CUSTOMER_ID,EVENT,VOLUME, DURATION, MONTH
ID1,100,500,200,201505
ID1,50,400,150,201506
ID1,80,600,50,201507
ID2,40,800,45,201505
ID2,25,650,120,201506
ID2,65,380,250,201507
ID3,30,950,110,201505
ID3,25,630,85,201506
ID3,15,780,60,201507")
df1$CUSTOMER_ID <- gsub(" ", "", df1$CUSTOMER_ID)
df2$CUSTOMER_ID <- gsub(" ", "", df2$CUSTOMER_ID)
library(dplyr)
USAGE <- c("EVENT", "VOLUME", "DURATION")
FUNC <- c("max", "mean", "sum")
dots <- lapply(USAGE, function(u) sprintf("%s(%s)", FUNC, u)) %>% unlist()
dots <- setNames(dots, sub("\\)", "", sub("\\(", "_", dots)))
sum_df <- df2 %>%
  group_by(CUSTOMER_ID) %>%
  summarize_(.dots = dots) %>%
  ungroup()
df1$CUSTOMER_ID <- as.character(df1$CUSTOMER_ID)
sum_df$CUSTOMER_ID <- as.character(sum_df$CUSTOMER_ID)
df1 <- left_join(df1, sum_df)
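As a side note, summarize_() with .dots is deprecated in current dplyr; the same grouped summary can be written with across() (a sketch, assuming dplyr >= 1.0):
sum_df <- df2 %>%
  group_by(CUSTOMER_ID) %>%
  summarize(across(all_of(USAGE),
                   list(max = max, mean = mean, sum = sum),
                   .names = "{.fn}_{.col}")) %>%
  ungroup()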
First we fetch the columns that are to be operated on, and the IDs:
mycols <- c("EVENT","VOLUME","DURATION")
id <- levels(df2$CUSTOMER_ID)
We are going to do this using the (much faster) apply functions, which let us operate on whole columns at once instead of one by one. Create a function that applies such an operation to each of the columns; we will then apply it over each ID.
For taking mean and summing, we may use the (very fast) colMeans and colSums.
applyfun <- function(i, FUN){
  FUN(df2[df2$CUSTOMER_ID == i, mycols])
}
For the maximum, we create a similar function:
colMax <- function(colData) {
  apply(colData, MARGIN = 2, max)
}
Apply the three functions
outmean <- sapply(id,applyfun,colMeans)
outsum <- sapply(id,applyfun,colSums)
outmax <- sapply(id,applyfun,colMax)
out <- data.frame(CUSTOMER_ID = rownames(t(outmean)),
                  mean = t(outmean),
                  sum = t(outsum),
                  max = t(outmax))
Merge the data onto df1
merge(df1, out, by = "CUSTOMER_ID", all.x = TRUE)
which gives the output:
CUSTOMER_ID Age ContractType Gender mean.EVENT ... max.DURATION
1 ID1 45 Postpaid m 76.66667 ... 200
2 ID2 50 Postpaid f 43.33333 ... 250
3 ID3 35 Postpaid f 23.33333 ... 110
4 ID4 44 Postpaid m NA ... NA
I had some whitespace problems with CUSTOMER_ID in your examples of df1 and df2, which I assume you do not have. To fix this I used:
df1$CUSTOMER_ID <- as.factor(trimws(df1$CUSTOMER_ID))
df2$CUSTOMER_ID <- as.factor(trimws(df2$CUSTOMER_ID))
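If speed on roughly 800,000 rows is the main concern, a grouped data.table aggregation is another common option (a sketch, independent of the answers above; it reuses USAGE from earlier and names the result columns like EVENT.max):
library(data.table)
dt2 <- as.data.table(df2)
sum_dt <- dt2[, as.list(unlist(lapply(.SD, function(col)
                c(max = max(col), mean = mean(col), sum = sum(col))))),
              by = CUSTOMER_ID, .SDcols = USAGE]
merge(df1, sum_dt, by = "CUSTOMER_ID", all.x = TRUE)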
