Combine fuzzy and exact merge in R - r

Here is my sample data:
a <- data.frame(name = c('Ace CO', 'Bayes', 'aasd', 'Apple', 'Orange', 'Banana',
'Ace CO', 'Bayes', 'aasd', 'Apple', 'Orange', 'Banana'),
date=c(1991,1991,1991,1991,1991,1991,
1992,1992,1992,1992,1992,1992),
price = c(10, 13, 2, 1, 15, 1,
11,15,3,2,14,4))
b <- data.frame(name = c('Ace Co.', 'Bayes INC.', 'asd',
'Ace Co.', 'Bayes INC.', 'asd'),
date=c(1991,1991,1991,1992,1992,1992),
qty = c(9, 99, 10,10,105,15))
I am left joining a to b by date and name, date is exact while name is fuzzy. I have tried stringdist_join but it only accomdates fuzzy merge.
The expected output is as follows:
c<- data.frame(name = c('Ace Co.', 'Bayes INC.', 'asd',
'Ace Co.', 'Bayes INC.', 'asd'),
date=c(1991,1991,1991,1992,1992,1992),
qty = c(9, 99, 10,10,105,15),
price = c(10, 13, 2,11,15,3))
I'd like to manipulate it under dplyr.

Using distance matrix to merge fuzzy strings
Main principle
Get the distance matrix between each unique terms of you vectors. Then, check what threshold might lead to the best results (this has to be human supervised I think).
Then, use this new correspondance table to merge your dataframes. Finallyyou can change names (i.e. adding "inc.") easier because you have "standardized" names.
With utils::adist()
I think stringdist is better because you can choose the method, but here is a base example as a suggestion on how to use this concept of distance to get the expected output.
# 1st create a matrix with the Standard Levenshtein distance between the name fields of both sources (or other method from stringdist)
dist_name_matrix <- adist(unique(a$name), unique(b$name), partial = TRUE, ignore.case = TRUE)
colnames(dist_name_matrix) <- unique(b$name)
rownames(dist_name_matrix) <- unique(a$name)
# lets convert this matrix to a dataframe for more visual changes, you will need to check it yourself
library(dplyr)
library(tidyr)
dist_df <- dist_name_matrix %>%
as.data.frame() %>%
tibble::rownames_to_column(., "a_name") %>%
pivot_longer(cols = 2:last_col(), names_to = "b_name", values_to = "dist") %>%
filter(dist < 2) # you might need to adapt this to your needs
# Now this can be used to merge your data i.e
a %>%
left_join(., dist_df, by = c("name" = "a_name")) %>%
right_join(., b, by = c("b_name" = "name", "date" = "date")) %>%
# added just to match your expected output
filter(!is.na(name)) %>%
select(b_name, date, qty, price)
Output:
b_name date qty price
1 Ace Co. 1991 9 10
2 Bayes INC. 1991 99 13
3 asd 1991 10 2
4 Ace Co. 1992 10 11
5 Bayes INC. 1992 105 15
6 asd 1992 15 3
Same process can be used with stringdist:
library(stringdist)
dist_name_matrix <- stringdistmatrix(unique(a$name), unique(b$name), method = "jw", useBytes = FALSE)
colnames(dist_name_matrix) <- unique(b$name)
rownames(dist_name_matrix) <- unique(a$name)
Then just adapt the threshold after human check i.e. filter(dist < 0.2)

agrep solution
The following function is almost surely not as general as it is supposed to be. But here it goes.
funMerge <- function(X, Y, col, col_approx, sep = "."){
other_cols.x <- setdiff(names(X), c(col, col_approx))
other_cols.y <- setdiff(names(Y), c(col, col_approx))
sp.x <- split(X, X[[col]])
sp.y <- split(Y, Y[[col]])
common_names <- intersect(names(sp.x), names(sp.y))
res <- sapply(common_names, function(sp.name){
x <- sp.x[[sp.name]]
y <- sp.y[[sp.name]]
k <- sapply(x[[col_approx]], agrep, y[[col_approx]])
k <- k[sapply(k, length) > 0]
k <- unlist(k)
i <- match(names(k), x[[col_approx]])
df_other.x <- x[k, other_cols.x, drop = FALSE]
df_other.y <- y[k, other_cols.y, drop = FALSE]
df_tmp <- data.frame(
x[k, col],
names(k),
y[k, col_approx]
)
names(df_tmp) <- c(col, col_approx, paste(col_approx, "y", sep = sep))
cbind(df_tmp, df_other.x, df_other.y)
}, simplify = FALSE)
res <- do.call(rbind, res)
row.names(res) <- NULL
res
}
funMerge(a, b, col = "date", col_approx = "name")
# date name name.y price qty
#1 1991 Ace Co Ace Co. 10 9
#2 1991 Bayes Bayes Inc. 13 99
#3 1991 asd asdf 2 10
#4 1992 Ace Co Ace CO. 11 10
#5 1992 Bayes Bayes INC. 15 105
#6 1992 asd aasdf 3 15
stringdist solution
The following function uses package stringdist to compute the Jaro-Winkler pairwise distances between the columns that need to be matched approximately.
From help('stringdist-metrics'), my emphasis.
The metric you need to choose for an application strongly depends on both the nature of the string (what does the string represent?) and the cause of dissimilarities between the strings you are measuring. For example, if you are comparing human-typed names that may contain typo's, the Jaro-Winkler distance may be of use. If you are comparing names that were written down after hearing them, a phonetic distance may be a better choice.
A more efficient algorithm would be to first split the data sets by the exact match column and then apply the method of funMerge2.
library(stringdist)
funMerge2 <- function(X, Y, col, col_approx, method = "jw", threshold = 0.2){
x <- X[[col_approx]]
y <- Y[[col_approx]]
d <- stringdistmatrix(x, y, method = method, useBytes = FALSE)
w <- which(d < threshold, arr.ind = TRUE)
Z1 <- X[w[, "row"], ]
Z2 <- Y[w[, "col"], ]
res <- cbind(Z1, Z2)
common_cols <- grep(col, names(res))
res <- res[apply(res[, common_cols], 1, function(x) x[1] == x[2]), ]
row.names(res) <- NULL
res
}
funMerge2(a, b, col = "date", col_approx = "name")
# name date price name date qty
#1 Ace Co 1991 10 Ace Co. 1991 9
#2 Bayes 1991 13 Bayes Inc. 1991 99
#3 asd 1991 2 asdf 1991 10
#4 Ace Co 1992 11 Ace CO. 1992 10
#5 Bayes 1992 15 Bayes INC. 1992 105
#6 asd 1992 3 aasdf 1992 15

Related

Merging/Scanning in R using like operator

I have two data frames in these formats.
df1 <- data.frame (Year = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA",
"FAMILY BROS", "family, apple, ibm","family co.")
)
df2 <- data.frame (Firm = c("APPLE ", "IBM", "Sonata Inc.","Family Bros. Co."))
I need to create a data frame that shows each firm and its corresponding year of being a winner as illustrated in Data3 in the attached figure. I checked few links like this one Merge tables in R using like
where they use a like operator but am unable to create the desired data as there can be multiple winners in a year. Please suggest what functions should I try to create Data3. Thanks!
Figure - Desired Data Format
Using adist basically.
sp <- strsplit(df1$Winner, ',|;') |> lapply(trimws)
sp <- t(sapply(sp, `length<-`, max(lengths(sp)))) |> as.data.frame() |> cbind(Year=df1$Year)
sp <- reshape(sp, 1:3, idvar=4, direction='l', sep='') |> na.omit()
sp$Firm <- cutree(hclust(as.dist(adist(gsub('inc|co', '', tolower(sp$V))))), 4) |>
factor(labels=c('Apple', 'Sonata Inc.', 'Family Bros. Co.', 'IBM'))
subset(sp[order(sp$Firm), ], select=c(Firm, Year))
# Firm Year
# 1.1 Apple 1991
# 2.1 Apple 1992
# 3.1 Apple 1993
# 6.2 Apple 1996
# 4.1 Sonata Inc. 1994
# 5.1 Family Bros. Co. 1995
# 6.1 Family Bros. Co. 1996
# 7.1 Family Bros. Co. 1997
# 3.2 IBM 1993
# 6.3 IBM 1996
Try this
df <- sapply(gsub("\\s[a-zA-Z]+\\W" , "" ,trimws(df2$Firm)),
function(x) grepl(tolower(x) ,
tolower(df1$Winner)))
l <- lapply(data.frame(df), function(x) df1$Year[x])
l
If you want the answer in data.frame use
ans <- data.frame(Firm = gsub("[0-9]+","",names(unlist(l))) ,
year = unlist(l))
row.names(ans) <- NULL
ans
Using fuzzyjoin.
(Use the second example only if the precise ordering matters.)
library(tidyverse)
library(fuzzyjoin)
# Data
df1 <- data.frame (Year = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA",
"FAMILY BROS", "family, apple, ibm","family co.")
)
df2 <- data.frame (Firm = c("APPLE ", "IBM", "Sonata Inc.","Family Bros. Co."))
# If the order is unimportant
df1_sep <- df1 |>
separate_rows(Winner) |>
filter(!Winner %in% c("", "CO.", "inc.", "co.", "INC.", "BROS"))
df2 |>
mutate(Firm = str_squish(Firm)) |>
regex_right_join(df1_sep, by = c("Firm" = "Winner"), ignore_case = TRUE) |>
arrange(Firm, Year) |>
select(-Winner)
#> Firm Year
#> 1 APPLE 1991
#> 2 APPLE 1992
#> 3 APPLE 1993
#> 4 APPLE 1996
#> 5 Family Bros. Co. 1995
#> 6 Family Bros. Co. 1996
#> 7 Family Bros. Co. 1997
#> 8 IBM 1993
#> 9 IBM 1996
#> 10 Sonata Inc. 1994
# If desired output order matters
df1_sep <- df1 |>
separate_rows(Winner) |>
filter(!Winner %in% c("", "CO.", "inc.", "co.", "INC.", "BROS"))
df2 |>
mutate(Firm = str_squish(Firm)) |>
regex_right_join(df1_sep, by = c("Firm" = "Winner"), ignore_case = TRUE) |>
group_by(Firm) |>
mutate(sort = min(Year)) |>
ungroup() |>
arrange(sort, Year) |>
select(-Winner, -sort)
#> # A tibble: 10 × 2
#> Firm Year
#> <chr> <dbl>
#> 1 APPLE 1991
#> 2 APPLE 1992
#> 3 APPLE 1993
#> 4 APPLE 1996
#> 5 IBM 1993
#> 6 IBM 1996
#> 7 Sonata Inc. 1994
#> 8 Family Bros. Co. 1995
#> 9 Family Bros. Co. 1996
#> 10 Family Bros. Co. 1997
Created on 2022-06-18 by the reprex package (v2.0.1)
Base R, sure a simpler solution exists:
# Split each winning company up into separate elements in a list
# of character vectors: winning_companies => list of character vectors
winning_companies <- strsplit(
df1$Winner,
"\\;|\\,"
)
# Unroll the data.frame: df1_unrolled => data.frame
df1_unrolled <- data.frame(
do.call(
rbind,
lapply(
seq_len(nrow((df1))),
function(i){
transform(
df1[rep(i, length(winning_companies[[i]])),],
Winner = trimws(unlist(winning_companies[[i]]), "both")
)
}
)
),
stringsAsFactors = FALSE,
row.names = NULL
)
# Clean up the search terms: firm_names_std => character vector
df2$firm_names_std <- trimws(
gsub(
"\\w+\\.",
"",
tolower(
df2$Firm
)
),
"both"
)
# Resolve a dictionary to be used to lookup items:
# firm_dictionary => character vector
firm_dictionary <- names(
sort(
table(
df2$firm_names_std
),
decreasing = TRUE
)
)
# Function to correct the spelling: correct_spelling => function
correct_spelling <- function(firm_name_vec, firm_dictionary, similarity_threshold = NULL) {
# Derive the similarity threshold: st => integer scalar
st <- similarity_threshold
# Clean the words: firm_name => string scalar
clean_firm_names <- trimws(
gsub(
"\\w+\\.",
"",
tolower(
firm_name_vec
)
),
"both"
)
# Function to correct the spelling at a scalar level:
# .correct_spelling_scalar => function
.correct_spelling_scalar <- function(firm_name, firm_dictionary, similarity_threshold = st){
# Calculate the levenshtein distance between the cleaned word
# and each element in the dictionary: distance_from_dict => double vector
distance_from_dict <- adist(firm_name, firm_dictionary, partial = TRUE)
# If we are not using a similarity threhold:
if(is.null(similarity_threshold)){
# Resolve the intermediate result: ir => character scalar
ir <- firm_dictionary[which.min(distance_from_dict)]
# Otherwise:
}else{
# Count the number of characters of each element in the dictionary
# vector: n => integer vector
n <- nchar(firm_dictionary)
# Calculate the ratio between the number of characters differing between
# each term in the dictionary and the total of number characters
# for a given dictionary element: dist_ratio => double vector
dist_ratio <- distance_from_dict / n
# Check if distance in ratio form is within the threshold:
# selection_idx => logical vector
selection_idx <- dist_ratio <= similarity_threshold
# Resolve the intermediate result: ir => character scalar
ir <- firm_dictionary[selection_idx]
}
# Resolve company name: res => string scalar
res <- head(
c(
ir,
NA_character_
),
1
)
# Explicitly define the returned object: character scalar => env
return(res)
}
# Apply function to a vector: res => character vector
res <- vapply(
clean_firm_names,
function(x){
.correct_spelling_scalar(x, firm_dictionary)
},
character(1),
USE.NAMES = FALSE
)
# Explicitly define the returned object: character vector => env
return(res)
}
# Derive the correct spelling of the firms:
# cleaned_firm_names => character vector
cleaned_firm_names <- correct_spelling(
df1_unrolled$Winner,
firm_dictionary
)
# Use the cleaned firm names to look up the formatted names in df2:
# df3 => data.frame
df3 <- transform(
df1_unrolled,
Winner = trimws(
df2$Firm[match(cleaned_firm_names, df2$firm_names_std)],
"both"
)
)
# Output result to console: data.frame => stdout(console)
df3
Data:
df1 <- data.frame (Year = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA",
"FAMILY BROS", "family, apple, ibm","family co.")
)
df2 <- data.frame (Firm = c("APPLE ", "IBM", "Sonata Inc.","Family Bros. Co."))

R - Merging two dataframe by text

I have two datasets which I want to merge :
df1 <- data.frame( title =
c("residence mozart",
"les hesperides auteuil mirabeau",
"chaillot",
"jouvenet",
"retraite dosne"))
df2 <- data.frame(title = c("terrasses mozart", "chaillot",
"villa jules janin", "retraites dosne"))
And I would like to have something like this :
1 residence mozart NA (or terrasses mozart)
2 les hesperides auteuil mirabeau NA
3 chaillot chaillot
4 jouvenet NA
5 retraite dosne retraites dosne
Here is what I did :
x = data.frame(title_df2 = matrix(ncol = 1, nrow = nrow(df1)))
for (i in nbr){
x[i, ] <- grep(df1$title[i], df2$title, value = T)
}
It does not work at all ! Even though grep(df1$title[5], df2$title, value = T) works and return "chaillot"!
If I understand correctly
df1 <- data.frame( title =
c("residence mozart",
"les hesperides auteuil mirabeau",
"chaillot",
"jouvenet",
"retraite dosne"))
df2 <- data.frame(title = c("terrasses mozart", "chaillot",
"villa jules janin", "retraites dosne"))
library(dplyr)
library(fuzzyjoin)
stringdist_left_join(x = df1, y = df2, method = "jw", distance_col = "d") %>%
filter(d < 0.25) %>%
right_join(df1, by = c("title.x" = "title"))
#> Joining by: "title"
#> title.x title.y d
#> 1 residence mozart terrasses mozart 0.23863636
#> 2 chaillot chaillot 0.00000000
#> 3 retraite dosne retraites dosne 0.09206349
#> 4 les hesperides auteuil mirabeau <NA> NA
#> 5 jouvenet <NA> NA
Created on 2021-04-19 by the reprex package (v2.0.0)
The issue is that grep returns a vector of length 0 when there is no match.
grep('a', 'hello', value = TRUE)
#character(0)
If we want to make use of the same for loop, make an adjustment in the code to return NA whereever there is no match
nbr <- seq_len(nrow(df1))
for (i in nbr){
x[i, ] <- c(grep(df1$title[i], df2$title, value = TRUE), NA_character_)[1]
}
-output
x
# title_df2
#1 <NA>
#2 <NA>
#3 chaillot
#4 <NA>
#5 <NA>
You could do:
a <-Vectorize(agrep, "pattern")(df1$title, df2$title, value=TRUE)
is.na(a)<- lengths(a) == 0
cbind(df1,df2_title=unlist(a, use.names = FALSE))
title df2_title
1 residence mozart <NA>
2 les hesperides auteuil mirabeau <NA>
3 chaillot chaillot
4 jouvenet <NA>
5 retraite dosne retraites dosne
To achieve your goal, you need a matching on each word of your strings within df1 title.
As used in your example, Grep will return an output only if there is a match on the full string.
In order to do that, you'll need to grep on possible words on df1 that are also contained in df2. This can be achieved by implementing an or condition on the full word contained in each string.
nbr <- 1:nrow(x)
for (i in nbr){
pattern <- paste("\\b",unlist(strsplit(as.character(df1$title[i]), " ")), "\\b", collapse = "|", sep = "") # here you create a regex expression whereby you can check if one of the words contained in 1 is also in df2. the \\b \\b escape makes sure that there is a full match on the single word.
fitInDataFrame <- grep(pattern, as.character(df2$title), value = T) # here you grep on the constructed regex expression
x[i, ] <- ifelse(length(fitInDataFrame) == 0, NA, fitInDataFrame)
}
Here the output:
> x
title_df2
1 terrasses mozart
2 <NA>
3 chaillot
4 <NA>
5 retraites dosne
You can do a left_join(df1, df2, by = c('title' = 'title'), keep = TRUE), specifying keep = TRUE so it doesn't drop df2's join column.
Or, for this particular case, you could do this:
df1$newcol <- ifelse(df1$title %in% df2$title, df1$title, NA)
This adds a new column to df1 which is filled out by going through each title in df1, checking if that title is in df2, if so writing that title in the second column and if not writing NA in that row of the second column. You could choose to put something else there instead, like:
df1$newcol <- ifelse(df1$title %in% df2$title, 'Title in DF2', 'Not in DF2')

Iterate through two dataframes in R and compare corresponding column values

I have two data frames with text data about users:
x <- data.frame("Address_line1" = c("123 Street","21 Hill drive"),
"City" = c("Chicago","London"), "Phone" = c("123","219"))
y <- data.frame("Address_line1" = c("461 road","PO Box 123","543 Highway"),
"City" = c("Dallas","Paris","New York" ), "Phone" = c("235","542","842"))
> x
Address_line1 City Phone
1 123 Street Chicago 123
2 21 Hill drive London 219
> y
Address_line1 City Phone
1 461 road Dallas 235
2 PO Box 123 Paris 542
3 543 Highway New York 842
For each row of the x dataframe, I want to iterate over all the rows in y, compare the corresponding columns (address to address, city to city etc.) and obtain the string distance for each.
So for the first row of x, I want an output like:
[16 20 20]
Where 16 is
stringdist("123 Street","461 road", method = "lv")+
stringdist("Chicago","Dallas", method = "lv")+
stringdist("123","235", method = "lv")
20 is the sum for second row and 20 for third.
Similarly, I want a list containing nrow(y) elements for each row of x.
We can use for loop
out <- c()
for(i in seq_len(nrow(x))) {
for(j in seq_len(nrow(y))) {
x1 <- x[i,]; y1 <- y[j,]
out <- c(out, sum(unlist(Map(stringdist, x1, y1,
MoreArgs = list(method = 'lv')))))
}
}
out
#[1] 16 20 20 19 20 21
It is not clear about the expected. We can also use tidyverse methods
library(dplyr)
library(tidyr)
library(purrr)
library(stringdist)
library(stringr)
crossing(x, y, .name_repair = 'unique') %>%
rename_all(~ str_remove(., "\\.{2,}")) %>%
split.default(str_remove(names(.), "\\d+$")) %>%
map(~ pmap(.x, ~ stringdist(..1, ..2, method = 'lv'))) %>%
transpose %>%
map_dbl(~ flatten_dbl(.x) %>%
sum)
#[1] 16 20 20 19 21 20

How do I replace "k" and "m" with thousands and millions?

I have a dataframe, parsed from Coursera. One of the columns is number of students enrolled on the course. Looks like this:
df <- data.frame(uni = c("Yale", "Toronto", "NYU"), students = c("16m", "240k", "7.5k"))
uni students
1 Yale "16m"
2 Toronto "240k"
3 NYU "7.5k"
What I need to get is
uni students
1 Yale 16000000
2 Toronto 240000
3 NYU 75000
So, the main difficulty for me there is that the class of values is character, and I do not know function for replacing ks and ms, and transforming the class of column to numerics.
Please, help me!
E.g.
d$students <- dplyr::case_when(
stringr::str_detect(d$students, 'm') ~ readr::parse_number(d$students) * 1e6,
stringr::str_detect(d$students, 'k') ~ readr::parse_number(d$students) * 1e3,
TRUE ~ parse_number(d$students)
)
An option with base r:
df$students <- ifelse(grepl('m', ignore.case = TRUE, df$students), as.numeric(gsub("[$m]", "", df$students)) * 10^6,
as.numeric(gsub("[$k]", "", df$students)) * 10^3)
# uni students
# 1 Yale 16000000
# 2 Toronto 240000
# 3 NYU 7500
Using stringr and dplyr from tidyverse
library(tidyverse)
df %>%
mutate(students = case_when(
str_detect(students, "m") ~ as.numeric(str_extract(students, "[\\d\\.]+")) * 1000000,
str_detect(students, "k") ~ as.numeric(str_extract(students, "[\\d\\.]+")) * 1000,
))
# A tibble: 3 x 2
uni students
<chr> <dbl>
1 Yale 16000000
2 Toronto 240000
3 NYU 7500
Here's an approach with separate that would work for any arbitrary number of modifiers, simply keep defining them in the case_when statement.
library(dplyr)
library(tidry)
df %>%
separate(students,into = c("value","modifier"),
sep = "(?<=[\\d])(?=[^\\d.])") %>%
mutate(modifier = case_when(modifier == "b" ~ 1000000000,
modifier == "m" ~ 1000000,
modifier == "k" ~ 1000,
TRUE ~ 1),
result = as.numeric(value) * modifier)
uni value modifier result
1 Yale 16 1e+06 1.6e+07
2 Toronto 240 1e+03 2.4e+05
3 NYU 7.5 1e+03 7.5e+03
Using gsub and dplyr:
df %>% mutate(
unit=gsub("[0-9]+\\.*[0-9]*","",students), #selecting unit
value=as.numeric(gsub("([0-9]+\\.*[0-9]+).", "\\1", students)),
students=ifelse(unit=="k",1e3*value,
ifelse(unit=="m",1e6*value,
ifelse(unit=="b",1e9*value,value)))) %>%
select(-c(unit,value))
One can write a function that does the conversion, for example:
f <- function(s) {
l <- nchar(s)
x <- as.numeric(substr(s, 1, l-1))
u <- substr(s, l, l)
x * 10^(3 * match(u, c("k", "M", "G")))
}
f("2M")
f("200k")
Edit: or a little bit more generic:
f <- function(s) {
x <- as.numeric(gsub("[kMG]", "", s))
u <- gsub("[0-9.]", "", s)
if (nchar(u)) x <- x * 10^(3 * match(u, c("k", "M", "G")))
x
}
f("20")
f("2M")
f("200k")

how to extract cells from r dataframe and add as a new row

I have a following dataframe in r
Names X_1 X_2 X_3 X_4
Name Sagar II Booster
Location India No Discharge Open
Depth 19.5 start End
DOC 3.2 FPL 64
Qunatity 234 SPL 50
Now I want to extract certain cells and their corresponding values in next cell.
My desired dataframe would be
Names Values
Name Sagar II
Location India
Discharge Open
Depth 19.5
DOC 3.2
FPL 64
SPL 50
How can I do it in r?
A solution from base R.
# Create example data frame
dt <- read.table(text = "Names X_1 X_2 X_3 X_4
Name Sagar II Booster
Location India No Discharge Open
Depth 19.5 start End
DOC 3.2 FPL 64
Qunatity 234 SPL 50",
stringsAsFactors = FALSE, header = TRUE, fill = TRUE)
# A list of target keys
target_key <- c("Name", "Location", "Discharge", "Depth", "DOC", "FPL", "SPL")
# A function to extract value based on key and create a new data frame
extract_fun <- function(key, df = dt){
Row <- which(apply(dt, 1, function(x) key %in% x))
Col <- which(apply(dt, 2, function(x) key %in% x))
df2 <- data.frame(Names = key, Values = df[Row, Col + 1],
stringsAsFactors = FALSE)
df2$Values <- as.character(df2$Values)
return(df2)
}
# Apply the extract_fun
ext_list <- lapply(target_key, extract_fun)
# Combine all data frame
dt_final <- do.call(rbind, ext_list)
dt_final
Names Values
1 Name Sagar
2 Location India
3 Discharge Open
4 Depth 19.5
5 DOC 3.2
6 FPL 64
7 SPL 50
Might not be the most efficient, but works for your example:
library(dplyr)
key_value = function(extraction){
temp = matrix(NA, nrow = length(extraction), ncol = 2)
temp[,1] = extraction
for(ii in 1:nrow(temp)){
index = df %>%
as.matrix %>%
{which(. == extraction[ii], arr.ind = TRUE)}
temp[ii, 2] = index %>% {df[.[1], .[2]+1]}
}
return(data.frame(Names = temp[,1], Values = temp[,2]))
}
Result:
> vec = c("Name", "Location", "Discharge", "Depth", "DOC", "FPL", "SPL")
> key_value(vec)
Names Values
1 Name SagarII
2 Location India
3 Discharge Open
4 Depth 19.5
5 DOC 3.2
6 FPL 64
7 SPL 50
Data:
df = read.table(text = "Names X_1 X_2 X_3 X_4
Name SagarII Booster NA NA
Location India No Discharge Open
Depth 19.5 start End NA
DOC 3.2 FPL 64 NA
Qunatity 234 SPL 50 NA", header = TRUE, stringsAsFactors = FALSE)

Resources