I have two data frames (e.g. df and df1) which have several columns. One of them is an ID column, which I would like to use to merge the data frames.
For example:
df$ID <- c("AB/XX/1/1/364/93/DA/3", "AT/PML/1/1/364/94/DA/3", "AT/PRT/1/1/364/95/DA/3", "AK/PRT/1/1/364/115/DA/3", "AB/XX/1/1/365/116/DA/3", "AB/XX/1/1/365/117/DA/3", "AT/PML/1/1/365/118/DA/3", NA, NA, "AK/PRT/1/1/1/149/DA/3", "AB/XX/1/1/1/151/DA/3", "AT/PML/1/1/2/152/DA/3", "AK/PRT/1/1/2/153/DA/3")
df1$ID <- c("AT/FBA/1/12/360/26/SF/96", "AT/RLMW/1/12/360/44/SF/122", "AT/ACR/1/12/362/66/SF/175", "AT/AA/1/12/363/72/SF/281", "AT/BB/1/12/364/90/SF/310", "AT/ANT/1/123/364/92/SF/338")
I want to merge some columns of df1 into df according to a code text that is contained within the ID string.
The code text is composed of the 5th and 6th '/'-separated fields of the ID. For example, "360/26", "360/44", etc.
(How to extract this code has been asked and answered here: R - Extract text between symbol or delimiter '/'.)
I tried a for loop using grepl, but with bad results:
# extract the 5th and 6th '/'-separated fields, e.g. "360/26"
str1_sub <- sub("^([^/]+/){4}([^/]*/[^/]*)/.*", "\\2", df1$ID)
# check if df$ID contains str1_sub
# (note: grepl() is not vectorized over its pattern argument, so only str1_sub[1] is used here)
grepl(str1_sub, df$ID, fixed = TRUE)
#create new empty columns in df that will be filled using df1's columns
df$banana <- NA
df$apple <- NA
# for loop to fill the columns
for (n in length(str1_sub)) {   # note: length() yields a single number, so this runs only once; seq_along() was probably intended
  for (m in length(df$ID)) {    # same issue here
    if (grepl(str1_sub[n], df$ID[m], fixed = TRUE) == TRUE) {
      df$banana[m] <- df1$BANANA[n]
      df$apple[m] <- df1$APPLE[n]
    }
  }
}
The if condition does not work; it fails with an error saying that a TRUE/FALSE value is missing.
I am sure there is a better way than a for loop.
You could create an ID_sub field and use a left_join.
I modified the first ID value of df1 to get at least one correspondence.
library(dplyr)
df <- data.frame(ID = c("AB/XX/1/1/364/93/DA/3", "AT/PML/1/1/364/94/DA/3", "AT/PRT/1/1/364/95/DA/3", "AK/PRT/1/1/364/115/DA/3", "AB/XX/1/1/365/116/DA/3", "AB/XX/1/1/365/117/DA/3", "AT/PML/1/1/365/118/DA/3", NA, NA, "AK/PRT/1/1/1/149/DA/3", "AB/XX/1/1/1/151/DA/3", "AT/PML/1/1/2/152/DA/3", "AK/PRT/1/1/2/153/DA/3"))
df1 <- data.frame(ID = c("AT/FBA/1/12/364/93/SF/96", "AT/RLMW/1/12/360/44/SF/122", "AT/ACR/1/12/362/66/SF/175", "AT/AA/1/12/363/72/SF/281", "AT/BB/1/12/364/90/SF/310", "AT/ANT/1/123/364/92/SF/338"))
df1$ID_sub <- sub("^([^/]+/){4}([^/]*/[^/]*)/.*", "\\2", df1$ID)
df$ID_sub <- sub("^([^/]+/){4}([^/]*/[^/]*)/.*", "\\2", df$ID)
df$banana <- "banana"
df1$BANANA <- "banana1"
left_join(df, df1, by = "ID_sub") %>%
  mutate(banana = if_else(!is.na(ID.y), BANANA, banana)) %>%
  select(ID = ID.x, banana)
#> ID banana
#> 1 AB/XX/1/1/364/93/DA/3 banana1
#> 2 AT/PML/1/1/364/94/DA/3 banana
#> 3 AT/PRT/1/1/364/95/DA/3 banana
#> 4 AK/PRT/1/1/364/115/DA/3 banana
#> 5 AB/XX/1/1/365/116/DA/3 banana
#> 6 AB/XX/1/1/365/117/DA/3 banana
#> 7 AT/PML/1/1/365/118/DA/3 banana
#> 8 <NA> banana
#> 9 <NA> banana
#> 10 AK/PRT/1/1/1/149/DA/3 banana
#> 11 AB/XX/1/1/1/151/DA/3 banana
#> 12 AT/PML/1/1/2/152/DA/3 banana
#> 13 AK/PRT/1/1/2/153/DA/3 banana
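If, as in the question, several columns (e.g. BANANA and APPLE) must be carried over, the same pattern extends naturally. A minimal sketch, assuming df1 also has an APPLE column (not part of the answer's example data); coalesce() is equivalent to the if_else() above, since unmatched rows hold NA after the join:
df$apple <- NA_character_
df1$APPLE <- "apple1"
left_join(df, df1, by = "ID_sub") %>%
  mutate(banana = coalesce(BANANA, banana),
         apple = coalesce(APPLE, apple)) %>%
  select(ID = ID.x, banana, apple)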
I would like to join the two data frames:
a <- data.frame(x=c(1,3,5))
b <- data.frame(start=c(0,4),end=c(2,6),y=c("a","b"))
with a condition like (x>start)&(x<end) in order to get such a result:
# x y
#1 1 a
#2 3 <NA>
#3 5 b
I don't want to build a potentially large cartesian product and then keep only the few rows matching the condition, and I'd like a solution using the tidyverse (I am not interested in a solution using SQL, which would be a confession of failure). I thought of the 'fuzzyjoin' package, but I cannot find examples fitting my need: the function to apply for the condition has only two arguments. I also tried to put 'start' and 'end' into a single argument with
data.frame(z = I(purrr::map2(b$start, b$end, list)), y = b$y)
# z y
#1 0, 2 a
#2 4, 6 b
but although the data looks fine, fuzzy_left_join doesn't accept it.
I am looking for solutions that work in more general cases (n variables on the LHS, m on the RHS, not necessarily numeric, with arbitrary conditions).
UPDATE
I also want to be able to express conditions like (x=start+1)|(x=end+1), giving here:
# x y
#1 1 a
#2 3 a
#3 5 b
For this case you don't need multi_by or multi_match_fun; this works:
library(fuzzyjoin)
fuzzy_left_join(a, b, by = c(x = "start", x = "end"), match_fun = list(`>`, `<`))
# x start end y
# 1 1 0 2 a
# 2 3 NA NA <NA>
# 3 5 4 6 b
I eventually went through the code of fuzzy_join and found a way to do what I want, even without proper documentation. fuzzy_left_join doesn't work here, but the following does (it is not really pretty, and it actually does a cartesian product):
g <- function(x, y) (x > y[, "start"]) & (x < y[, "end"])
fuzzy_join(a, b, multi_by = list(x = "x", y = c("start", "end")),
           multi_match_fun = g, mode = "left") %>%
  select(x, y)
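The same multi_match_fun mechanism appears to handle the OR condition from the update as well; a sketch along the same lines (not shown in the original answer):
g2 <- function(x, y) (x == y[, "start"] + 1) | (x == y[, "end"] + 1)
fuzzy_join(a, b, multi_by = list(x = "x", y = c("start", "end")),
           multi_match_fun = g2, mode = "left") %>%
  select(x, y)
# should reproduce the update's expected result: (1, a), (3, a), (5, b)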
A data.table approach could be:
library(data.table)
name1 <- setdiff(names(setDT(b)), names(setDT(a)))
#perform left outer join and then select required columns
a[b, (name1) := mget(name1), on = .(x > start, x < end)][, .(x, y)]
which gives
x y
1: 1 a
2: 3 <NA>
3: 5 b
Sample data:
a <- data.frame(x = c(1, 3, 5))
b <- data.frame(start = c(0, 4), end = c(2, 6), y = c("a", "b"))
Update: In case you want to join both data frames on the (x=start+1)|(x=end+1) condition, you can try:
library(data.table)
DT1 <- as.data.table(a)
DT2 <- as.data.table(b)
#Perform 1st join on "x = start+1" and then another on "x = end+1". Finally row-bind both results.
DT <- rbindlist(list(
  DT1[DT2[, start_temp := start + 1], on = c(x = "start_temp"), .(x, y), nomatch = 0],
  DT1[DT2[, end_temp := end + 1], on = c(x = "end_temp"), .(x, y), nomatch = 0]
))
DT
# x y
#1: 1 a
#2: 5 b
#3: 3 a
A possible answer to explain what I am trying to do: extending dplyr in some way. I will be happy to know if there are ways to improve this solution, or about problems I didn't see.
The solution avoids the cartesian product, but it duplicates both one of the input data frames and the result into lists of data frames. I didn't include the final selection of the x and y columns, which is easy to code.
my_left_join <- function(.DATA1, .DATA2, .WHERE) {
  call <- as.list(match.call())
  df1 <- .DATA1
  df1$._row_ <- 1:nrow(df1)
  # split df1 into a list of one-row data frames
  dfl1 <- replyr::replyr_split(df1, "._row_")
  # for each row, filter .DATA2 on the condition and tag matches with the row id
  eval(substitute(
    dfl2 <- mapply(function(.x) {
      filter(.DATA2, with(.x, WHERE)) %>%
        mutate(._row_ = .x$._row_)
    }, dfl1, SIMPLIFY = FALSE),
    list(WHERE = call$.WHERE)))
  df2 <- replyr::replyr_bind_rows(dfl2)
  left_join(df1, df2, by = "._row_") %>% select(-._row_)
}
my_left_join(a,b,(x>start)&(x<end))
# x start end y
#1 1 0 2 a
#2 3 NA NA <NA>
#3 5 4 6 b
my_left_join(a,b,(x==(start+1))|(x==(end+1)))
# x start end y
#1 1 0 2 a
#2 3 0 2 a
#3 5 4 6 b
You can try a GenomicRanges solution:
library(GenomicRanges)
# setup GRanges objects
a_gr <- GRanges(1, IRanges(a$x,a$x))
b_gr <- GRanges(1, IRanges(b$start, b$end))
# find overlaps between the two data sets
res <- as.data.frame(findOverlaps(a_gr,b_gr))
# create the expected output
a$y <- NA
a$y[res$queryHits] <- as.character(b$y)[res$subjectHits]
a
x y
1 1 a
2 3 <NA>
3 5 b
I need to associate a group ID with the rows of my data frame: there are 20k groups, which amount to 12M rows in total.
To solve this problem I wrote a for loop, but it is clearly totally inefficient, and I am sure this task can be easily vectorized. However, I am struggling to understand how to write this instruction in a vectorized fashion.
The problem is the following:
I have an auxiliary_table with 3 features: ID, start_row, end_row.
start_row is the row index of the first element in my_DF belonging to ID x;
end_row is the row index of the last element in my_DF belonging to ID x.
The vectorized instruction should do the following.
Consider an auxiliary_table like this:
auxiliary_table <- data.frame(ID = c(1,2,3,4), start_row = c(1,4,8,13), end_row = c(3,7,12,14))
Consider a my_df like the following:
my_df <- data.frame(Var_a = c(1,2,3,1,2,3,4,6,4,3,1,2,1,1))
We need to associate the ID based on the start_row and end_row index information contained in the auxiliary_table.
The solution_df is:
solution_df <- data.frame(my_df, ID = c(1,1,1,2,2,2,2,3,3,3,3,3,4,4))
I asked for a vectorization of the for loop, but I am also open to, for example, data.table solutions.
I hope I was clear and presented my question correctly.
The auxiliary_table is kind of run-length encoded. Therefore, I suggest trying the inverse.rle() function with an appropriately transformed auxiliary_table:
1. dplyr
library(dplyr)
my_df %>%
  mutate(ID = auxiliary_table %>%
           transmute(lengths = end_row - start_row + 1L, values = ID) %>%
           inverse.rle())
Var_a ID
1 1 1
2 2 1
3 3 1
4 1 2
5 2 2
6 3 2
7 4 2
8 6 3
9 4 3
10 3 3
11 1 3
12 2 3
13 1 4
14 1 4
2. data.table
This adds the ID column without copying my_df.
library(data.table)
setDT(my_df)[, ID := inverse.rle(setDT(auxiliary_table)[
, .(lengths = end_row - start_row + 1L, values = ID)])][]
Depending on the size of auxiliary_table the code below might be somewhat more efficient because it transforms auxiliary_table in place:
setDT(my_df)[, ID := inverse.rle(setDT(auxiliary_table)[
, lengths := end_row - start_row + 1L][
, c("end_row", "start_row") := NULL][
, setnames(.SD, "ID", "values")])][]
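For comparison, the same expansion can be written with plain base rep(), assuming (as inverse.rle() also requires) that the row ranges are contiguous and cover my_df completely; a minimal sketch:
my_df$ID <- rep(auxiliary_table$ID,
                auxiliary_table$end_row - auxiliary_table$start_row + 1L)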
I have designed a user-defined function and applied it to the auxiliary_table. See if this helps:
auxiliary_table <- data.frame(ID = c(1,2,3,4), start_row = c(1,4,8,13), end_row = c(3,7,12,14))
my_df <- data.frame(Var_a = c(1,2,3,1,2,3,4,6,4,3,1,2,1,1))
solution_df <- data.frame(my_df, ID=c(1,1,1,2,2,2,2,3,3,3,3,3,4,4))
aux_to_df <- function(aux_row) {
  # 1, 2, 3 can be replaced by column names
  value <- aux_row[1]
  start_row <- aux_row[2]
  end_row <- aux_row[3]
  my_df[start_row:end_row, "ID"] <<- value  # <<- assigns to the global variable outside the function's scope
}
apply(auxiliary_table, 1, aux_to_df)
my_df
I have a dataset of about 105,000 rows and 30 columns. I have a categorical variable that I would like to map to a number. In Excel, I would probably do something with VLOOKUP and fill.
How would I go about doing the same thing in R?
Essentially, what I have is a HouseType variable, and I need to calculate the HouseTypeNo. Here are some sample data:
HouseType HouseTypeNo
Semi 1
Single 2
Row 3
Single 2
Apartment 4
Apartment 4
Row 3
If I understand your question correctly, here are four methods to do the equivalent of Excel's VLOOKUP and fill down using R:
# load sample data from Q
hous <- read.table(header = TRUE,
stringsAsFactors = FALSE,
text="HouseType HouseTypeNo
Semi 1
Single 2
Row 3
Single 2
Apartment 4
Apartment 4
Row 3")
# create a toy large table with a 'HouseType' column
# but no 'HouseTypeNo' column (yet)
largetable <- data.frame(HouseType = as.character(sample(unique(hous$HouseType), 1000, replace = TRUE)), stringsAsFactors = FALSE)
# create a lookup table to get the numbers to fill
# the large table
lookup <- unique(hous)
HouseType HouseTypeNo
1 Semi 1
2 Single 2
3 Row 3
5 Apartment 4
Here are four methods to fill the HouseTypeNo in the largetable using the values in the lookup table:
First with merge in base:
# 1. using base
base1 <- (merge(lookup, largetable, by = 'HouseType'))
A second method with named vectors in base:
# 2. using base and a named vector
housenames <- as.numeric(1:length(unique(hous$HouseType)))
names(housenames) <- unique(hous$HouseType)
base2 <- data.frame(HouseType = largetable$HouseType,
HouseTypeNo = (housenames[largetable$HouseType]))
Third, using the plyr package:
# 3. using the plyr package
library(plyr)
plyr1 <- join(largetable, lookup, by = "HouseType")
Fourth, using the sqldf package:
# 4. using the sqldf package
library(sqldf)
sqldf1 <- sqldf("SELECT largetable.HouseType, lookup.HouseTypeNo
FROM largetable
INNER JOIN lookup
ON largetable.HouseType = lookup.HouseType")
If it's possible that some house types in largetable do not exist in lookup then a left join would be used:
sqldf("select * from largetable left join lookup using (HouseType)")
Corresponding changes to the other solutions would be needed too.
Is that what you wanted to do? Let me know which method you like and I'll add commentary.
I think you can also use match():
largetable$HouseTypeNo <- with(lookup,
HouseTypeNo[match(largetable$HouseType,
HouseType)])
This still works if I scramble the order of lookup.
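A closely related base idiom, also robust to the order of lookup, builds a named vector once; a small sketch:
largetable$HouseTypeNo <- setNames(lookup$HouseTypeNo, lookup$HouseType)[largetable$HouseType]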
I also like using qdapTools::lookup, or the shorthand binary operator %l%. It works identically to an Excel vlookup, but it accepts name arguments as opposed to column numbers.
## Replicate Ben's data:
hous <- structure(list(HouseType = c("Semi", "Single", "Row", "Single",
"Apartment", "Apartment", "Row"), HouseTypeNo = c(1L, 2L, 3L,
2L, 4L, 4L, 3L)), .Names = c("HouseType", "HouseTypeNo"),
class = "data.frame", row.names = c(NA, -7L))
largetable <- data.frame(HouseType = as.character(sample(unique(hous$HouseType),
1000, replace = TRUE)), stringsAsFactors = FALSE)
## It's this simple:
library(qdapTools)
largetable[, 1] %l% hous
The poster didn't ask about looking up values with exact=FALSE, but I'm adding this as an answer for my own reference and possibly for others.
If you're looking up categorical values, use the other answers.
Excel's vlookup also allows you to match approximately for numeric values with the 4th argument(1), match=TRUE. I think of match=TRUE as looking up values on a thermometer. The default value is FALSE, which is perfect for categorical values.
If you want to match approximately (perform a lookup), R has a function called findInterval, which (as the name implies) will find the interval / bin that contains your continuous numeric value.
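As a quick illustration of findInterval() with made-up break points:
findInterval(c(5, 15, 25), c(0, 10, 20, 30))
# [1] 1 2 3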
However, let's say that you want to findInterval for several values. You could write a loop or use an apply function. However, I've found it more efficient to take a DIY vectorized approach.
Let's say that you have a grid of values indexed by x and y:
grid <- list(x = c(-87.727, -87.723, -87.719, -87.715, -87.711),
y = c(41.836, 41.839, 41.843, 41.847, 41.851),
z = (matrix(data = c(-3.428, -3.722, -3.061, -2.554, -2.362,
-3.034, -3.925, -3.639, -3.357, -3.283,
-0.152, -1.688, -2.765, -3.084, -2.742,
1.973, 1.193, -0.354, -1.682, -1.803,
0.998, 2.863, 3.224, 1.541, -0.044),
nrow = 5, ncol = 5)))
and you have some values you want to look up by x and y:
df <- data.frame(x = c(-87.723, -87.712, -87.726, -87.719, -87.722, -87.722),
                 y = c(41.84, 41.842, 41.844, 41.849, 41.838, 41.842),
                 id = c("a", "b", "c", "d", "e", "f"))
Here is the example visualized:
contour(grid)
points(df$x, df$y, pch=df$id, col="blue", cex=1.2)
You can find the x intervals and y intervals with this type of formula:
xrng <- range(grid$x)
xbins <- length(grid$x) -1
yrng <- range(grid$y)
ybins <- length(grid$y) -1
df$ix <- trunc( (df$x - min(xrng)) / diff(xrng) * (xbins)) + 1
df$iy <- trunc( (df$y - min(yrng)) / diff(yrng) * (ybins)) + 1
You could take it one step further and perform a (simplistic) interpolation on the z values in grid like this:
df$z <- with(df, (grid$z[cbind(ix, iy)] +
grid$z[cbind(ix + 1, iy)] +
grid$z[cbind(ix, iy + 1)] +
grid$z[cbind(ix + 1, iy + 1)]) / 4)
Which gives you these values:
contour(grid, xlim = range(c(grid$x, df$x)), ylim = range(c(grid$y, df$y)))
points(df$x, df$y, pch=df$id, col="blue", cex=1.2)
text(df$x + .001, df$y, lab=round(df$z, 2), col="blue", cex=1)
df
# x y id ix iy z
# 1 -87.723 41.840 a 2 2 -3.00425
# 2 -87.712 41.842 b 4 2 -3.11650
# 3 -87.726 41.844 c 1 3 0.33150
# 4 -87.719 41.849 d 3 4 0.68225
# 6 -87.722 41.838 e 2 1 -3.58675
# 7 -87.722 41.842 f 2 2 -3.00425
Note that ix and iy could also have been found with a loop using findInterval; e.g., here's one example for the second row:
findInterval(df$x[2], grid$x)
# 4
findInterval(df$y[2], grid$y)
# 2
which matches ix and iy in df[2, ].
Footnote:
(1) The fourth argument of vlookup was previously called "match", but after they introduced the ribbon it was renamed to "[range_lookup]".
Solution #2 of @Ben's answer is not reproducible in other, more generic examples. It happens to give the correct lookup in the example because the unique HouseType values in hous appear in increasing order. Try this:
hous <- read.table(header = TRUE, stringsAsFactors = FALSE, text="HouseType HouseTypeNo
Semi 1
ECIIsHome 17
Single 2
Row 3
Single 2
Apartment 4
Apartment 4
Row 3")
largetable <- data.frame(HouseType = as.character(sample(unique(hous$HouseType), 1000, replace = TRUE)), stringsAsFactors = FALSE)
lookup <- unique(hous)
Ben's solution #2 gives
housenames <- as.numeric(1:length(unique(hous$HouseType)))
names(housenames) <- unique(hous$HouseType)
base2 <- data.frame(HouseType = largetable$HouseType,
HouseTypeNo = (housenames[largetable$HouseType]))
which gives
unique(base2$HouseTypeNo[ base2$HouseType=="ECIIsHome" ])
[1] 2
when the correct answer from the lookup table is 17.
The correct way to do it is
hous <- read.table(header = TRUE, stringsAsFactors = FALSE, text="HouseType HouseTypeNo
Semi 1
ECIIsHome 17
Single 2
Row 3
Single 2
Apartment 4
Apartment 4
Row 3")
largetable <- data.frame(HouseType = as.character(sample(unique(hous$HouseType), 1000, replace = TRUE)), stringsAsFactors = FALSE)
housenames <- tapply(hous$HouseTypeNo, hous$HouseType, unique)
base2 <- data.frame(HouseType = largetable$HouseType,
HouseTypeNo = (housenames[largetable$HouseType]))
Now the lookups are performed correctly
unique(base2$HouseTypeNo[ base2$HouseType=="ECIIsHome" ])
ECIIsHome
17
I tried to edit Ben's answer but it was rejected for reasons I cannot understand.
Starting with:
houses <- read.table(text="Semi 1
Single 2
Row 3
Single 2
Apartment 4
Apartment 4
Row 3",col.names=c("HouseType","HouseTypeNo"))
... you can use
as.numeric(factor(houses$HouseType))
... to give a unique number for each house type. You can see the result here:
> houses2 <- data.frame(houses,as.numeric(factor(houses$HouseType)))
> houses2
HouseType HouseTypeNo as.numeric.factor.houses.HouseType..
1 Semi 1 3
2 Single 2 4
3 Row 3 2
4 Single 2 4
5 Apartment 4 1
6 Apartment 4 1
7 Row 3 2
... so you end up with different numbers on the rows (because the factors are ordered alphabetically) but the same pattern.
(EDIT: the remaining text in this answer is actually redundant. It occurred to me to check and it turned out that read.table() had already made houses$HouseType into a factor when it was read into the dataframe in the first place).
However, you may well be better just to convert HouseType to a factor, which would give you all the same benefits as HouseTypeNo, but would be easier to interpret because the house types are named rather than numbered, e.g.:
> houses3 <- houses
> houses3$HouseType <- factor(houses3$HouseType)
> houses3
HouseType HouseTypeNo
1 Semi 1
2 Single 2
3 Row 3
4 Single 2
5 Apartment 4
6 Apartment 4
7 Row 3
> levels(houses3$HouseType)
[1] "Apartment" "Row" "Semi" "Single"
You could use mapvalues() from the plyr package.
Initial data:
dat <- data.frame(HouseType = c("Semi", "Single", "Row", "Single", "Apartment", "Apartment", "Row"))
> dat
HouseType
1 Semi
2 Single
3 Row
4 Single
5 Apartment
6 Apartment
7 Row
Lookup / crosswalk table:
lookup <- data.frame(type_text = c("Semi", "Single", "Row", "Apartment"), type_num = c(1, 2, 3, 4))
> lookup
type_text type_num
1 Semi 1
2 Single 2
3 Row 3
4 Apartment 4
Create the new variable:
dat$house_type_num <- plyr::mapvalues(dat$HouseType, from = lookup$type_text, to = lookup$type_num)
Or for simple replacements you can skip creating a long lookup table and do this directly in one step:
dat$house_type_num <- plyr::mapvalues(dat$HouseType,
from = c("Semi", "Single", "Row", "Apartment"),
to = c(1, 2, 3, 4))
Result:
> dat
HouseType house_type_num
1 Semi 1
2 Single 2
3 Row 3
4 Single 2
5 Apartment 4
6 Apartment 4
7 Row 3
Using merge is different from a lookup in Excel, as it can duplicate (multiply) your data if the primary key constraint is not enforced in the lookup table, or reduce the number of records if you are not using all.x = T.
To make sure you don't get into trouble with that and can look up safely, I suggest two strategies.
The first is to check the number of duplicated rows in the lookup key:
safeLookup <- function(data, lookup, by, select = setdiff(colnames(lookup), by)) {
# Merges data to lookup making sure that the number of rows does not change.
stopifnot(sum(duplicated(lookup[, by])) == 0)
res <- merge(data, lookup[, c(by, select)], by = by, all.x = T)
return (res)
}
This will force you to de-dupe the lookup dataset before using it:
baseSafe <- safeLookup(largetable, house.ids, by = "HouseType")
# Error: sum(duplicated(lookup[, by])) == 0 is not TRUE
baseSafe<- safeLookup(largetable, unique(house.ids), by = "HouseType")
head(baseSafe)
# HouseType HouseTypeNo
# 1 Apartment 4
# 2 Apartment 4
# ...
The second option is to reproduce Excel's behaviour by taking the first matching value from the lookup dataset:
firstLookup <- function(data, lookup, by, select = setdiff(colnames(lookup), by)) {
# Merges data to lookup using first row per unique combination in by.
unique.lookup <- lookup[!duplicated(lookup[, by]), ]
res <- merge(data, unique.lookup[, c(by, select)], by = by, all.x = T)
return (res)
}
baseFirst <- firstLookup(largetable, house.ids, by = "HouseType")
These functions are slightly different from lookup, as they can add multiple columns at once.
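If only one column is wanted, the select argument in the definitions above restricts the merge; purely illustrative, with house.ids being the same hypothetical lookup table used above:
baseFirst <- firstLookup(largetable, house.ids, by = "HouseType", select = "HouseTypeNo")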
The lookup package can be used here:
library(lookup)
# reference data
hous <- data.frame(HouseType=c("Semi","Single","Row","Single","Apartment","Apartment","Row"),
HouseTypeNo=c(1,2,3,2,4,4,3))
# new large data with HouseType but no HouseTypeNo
largetable <- data.frame(HouseType = sample(unique(hous$HouseType), 1000, replace = TRUE))
# vector approach
largetable$num1 <- lookup(largetable$HouseType, hous$HouseType, hous$HouseTypeNo)
# dataframe approach
largetable$num2 <- vlookup(largetable$HouseType, hous, "HouseType", "HouseTypeNo")
head(largetable)
# HouseType num1 num2
# 1 Semi 1 1
# 2 Semi 1 1
# 3 Apartment 4 4
# 4 Semi 1 1
# 5 Single 2 2
# 6 Single 2 2