Merging/scanning in R using a like operator

I have two data frames in these formats.
df1 <- data.frame(
  Year = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
  Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA",
             "FAMILY BROS", "family, apple, ibm", "family co.")
)
df2 <- data.frame(Firm = c("APPLE ", "IBM", "Sonata Inc.", "Family Bros. Co."))
I need to create a data frame that shows each firm and its corresponding year of being a winner, as illustrated in Data3 in the attached figure. I checked a few links like this one: Merge tables in R using like,
where a like operator is used, but I am unable to create the desired data because there can be multiple winners in a year. Please suggest what functions I should try to create Data3. Thanks!
Figure: Desired Data Format

Using adist, basically.
# Split multi-winner cells on ',' or ';' and trim whitespace
sp <- strsplit(df1$Winner, ',|;') |> lapply(trimws)
# Pad to a common length and attach the year
sp <- t(sapply(sp, `length<-`, max(lengths(sp)))) |> as.data.frame() |> cbind(Year=df1$Year)
# Reshape to long format and drop the empty slots
sp <- reshape(sp, 1:3, idvar=4, direction='long', sep='') |> na.omit()
# Cluster the cleaned-up names by edit distance into the four known firms
sp$Firm <- cutree(hclust(as.dist(adist(gsub('inc|co', '', tolower(sp$V))))), 4) |>
  factor(labels=c('Apple', 'Sonata Inc.', 'Family Bros. Co.', 'IBM'))
subset(sp[order(sp$Firm), ], select=c(Firm, Year))
#                 Firm Year
# 1.1            Apple 1991
# 2.1            Apple 1992
# 3.1            Apple 1993
# 6.2            Apple 1996
# 4.1      Sonata Inc. 1994
# 5.1 Family Bros. Co. 1995
# 6.1 Family Bros. Co. 1996
# 7.1 Family Bros. Co. 1997
# 3.2              IBM 1993
# 6.3              IBM 1996

Try this
# Reduce each firm to its leading word (dropping suffixes such as " Inc." or " Bros. Co."),
# then test whether that word occurs in each Winner string, case-insensitively
df <- sapply(gsub("\\s[a-zA-Z]+\\W", "", trimws(df2$Firm)),
             function(x) grepl(tolower(x), tolower(df1$Winner)))
l <- lapply(data.frame(df), function(x) df1$Year[x])
l
If you want the answer in a data.frame, use
ans <- data.frame(Firm = gsub("[0-9]+", "", names(unlist(l))),
                  year = unlist(l))
row.names(ans) <- NULL
ans

Using fuzzyjoin.
(Use the second example only if the precise ordering matters.)
library(tidyverse)
library(fuzzyjoin)
# Data
df1 <- data.frame(
  Year = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
  Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA",
             "FAMILY BROS", "family, apple, ibm", "family co.")
)
df2 <- data.frame(Firm = c("APPLE ", "IBM", "Sonata Inc.", "Family Bros. Co."))
# If the order is unimportant
df1_sep <- df1 |>
  separate_rows(Winner) |>
  filter(!Winner %in% c("", "CO.", "inc.", "co.", "INC.", "BROS"))
df2 |>
  mutate(Firm = str_squish(Firm)) |>
  regex_right_join(df1_sep, by = c("Firm" = "Winner"), ignore_case = TRUE) |>
  arrange(Firm, Year) |>
  select(-Winner)
#>                Firm Year
#> 1             APPLE 1991
#> 2             APPLE 1992
#> 3             APPLE 1993
#> 4             APPLE 1996
#> 5  Family Bros. Co. 1995
#> 6  Family Bros. Co. 1996
#> 7  Family Bros. Co. 1997
#> 8               IBM 1993
#> 9               IBM 1996
#> 10      Sonata Inc. 1994
# If desired output order matters
df1_sep <- df1 |>
  separate_rows(Winner) |>
  filter(!Winner %in% c("", "CO.", "inc.", "co.", "INC.", "BROS"))
df2 |>
  mutate(Firm = str_squish(Firm)) |>
  regex_right_join(df1_sep, by = c("Firm" = "Winner"), ignore_case = TRUE) |>
  group_by(Firm) |>
  mutate(sort = min(Year)) |>
  ungroup() |>
  arrange(sort, Year) |>
  select(-Winner, -sort)
#> # A tibble: 10 × 2
#>    Firm              Year
#>    <chr>            <dbl>
#>  1 APPLE             1991
#>  2 APPLE             1992
#>  3 APPLE             1993
#>  4 APPLE             1996
#>  5 IBM               1993
#>  6 IBM               1996
#>  7 Sonata Inc.       1994
#>  8 Family Bros. Co.  1995
#>  9 Family Bros. Co.  1996
#> 10 Family Bros. Co.  1997
Created on 2022-06-18 by the reprex package (v2.0.1)

Base R; surely a simpler solution exists:
# Split each winning company up into separate elements in a list
# of character vectors: winning_companies => list of character vectors
winning_companies <- strsplit(
  df1$Winner,
  "\\;|\\,"
)
# Unroll the data.frame: df1_unrolled => data.frame
df1_unrolled <- data.frame(
  do.call(
    rbind,
    lapply(
      seq_len(nrow(df1)),
      function(i){
        transform(
          df1[rep(i, length(winning_companies[[i]])), ],
          Winner = trimws(unlist(winning_companies[[i]]), "both")
        )
      }
    )
  ),
  stringsAsFactors = FALSE,
  row.names = NULL
)
# Clean up the search terms: firm_names_std => character vector
df2$firm_names_std <- trimws(
  gsub(
    "\\w+\\.",
    "",
    tolower(
      df2$Firm
    )
  ),
  "both"
)
# Resolve a dictionary to be used to look up items:
# firm_dictionary => character vector
firm_dictionary <- names(
  sort(
    table(
      df2$firm_names_std
    ),
    decreasing = TRUE
  )
)
# Function to correct the spelling: correct_spelling => function
correct_spelling <- function(firm_name_vec, firm_dictionary, similarity_threshold = NULL) {
  # Derive the similarity threshold: st => numeric scalar (or NULL)
  st <- similarity_threshold
  # Clean the words: clean_firm_names => character vector
  clean_firm_names <- trimws(
    gsub(
      "\\w+\\.",
      "",
      tolower(
        firm_name_vec
      )
    ),
    "both"
  )
  # Function to correct the spelling at a scalar level:
  # .correct_spelling_scalar => function
  .correct_spelling_scalar <- function(firm_name, firm_dictionary, similarity_threshold = st){
    # Calculate the Levenshtein distance between the cleaned word
    # and each element in the dictionary: distance_from_dict => double vector
    distance_from_dict <- adist(firm_name, firm_dictionary, partial = TRUE)
    # If we are not using a similarity threshold:
    if(is.null(similarity_threshold)){
      # Resolve the intermediate result: ir => character scalar
      ir <- firm_dictionary[which.min(distance_from_dict)]
    # Otherwise:
    }else{
      # Count the number of characters of each element in the dictionary
      # vector: n => integer vector
      n <- nchar(firm_dictionary)
      # Calculate the ratio between the number of characters differing between
      # each term in the dictionary and the total number of characters
      # for a given dictionary element: dist_ratio => double vector
      dist_ratio <- distance_from_dict / n
      # Check if the distance in ratio form is within the threshold:
      # selection_idx => logical vector
      selection_idx <- dist_ratio <= similarity_threshold
      # Resolve the intermediate result: ir => character scalar
      ir <- firm_dictionary[selection_idx]
    }
    # Resolve company name: res => string scalar
    res <- head(
      c(
        ir,
        NA_character_
      ),
      1
    )
    # Explicitly define the returned object: character scalar => env
    return(res)
  }
  # Apply function to a vector: res => character vector
  res <- vapply(
    clean_firm_names,
    function(x){
      .correct_spelling_scalar(x, firm_dictionary)
    },
    character(1),
    USE.NAMES = FALSE
  )
  # Explicitly define the returned object: character vector => env
  return(res)
}
# Derive the correct spelling of the firms:
# cleaned_firm_names => character vector
cleaned_firm_names <- correct_spelling(
  df1_unrolled$Winner,
  firm_dictionary
)
# Use the cleaned firm names to look up the formatted names in df2:
# df3 => data.frame
df3 <- transform(
  df1_unrolled,
  Winner = trimws(
    df2$Firm[match(cleaned_firm_names, df2$firm_names_std)],
    "both"
  )
)
# Output result to console: data.frame => stdout(console)
df3
Data:
df1 <- data.frame(
  Year = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
  Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA",
             "FAMILY BROS", "family, apple, ibm", "family co.")
)
df2 <- data.frame(Firm = c("APPLE ", "IBM", "Sonata Inc.", "Family Bros. Co."))

Combine fuzzy and exact merge in R

Here is my sample data:
a <- data.frame(name = c('Ace CO', 'Bayes', 'aasd', 'Apple', 'Orange', 'Banana',
                         'Ace CO', 'Bayes', 'aasd', 'Apple', 'Orange', 'Banana'),
                date = c(1991, 1991, 1991, 1991, 1991, 1991,
                         1992, 1992, 1992, 1992, 1992, 1992),
                price = c(10, 13, 2, 1, 15, 1,
                          11, 15, 3, 2, 14, 4))
b <- data.frame(name = c('Ace Co.', 'Bayes INC.', 'asd',
                         'Ace Co.', 'Bayes INC.', 'asd'),
                date = c(1991, 1991, 1991, 1992, 1992, 1992),
                qty = c(9, 99, 10, 10, 105, 15))
I am left joining a to b by date and name; date is exact while name is fuzzy. I have tried stringdist_join but it only accommodates fuzzy merging.
The expected output is as follows:
c <- data.frame(name = c('Ace Co.', 'Bayes INC.', 'asd',
                         'Ace Co.', 'Bayes INC.', 'asd'),
                date = c(1991, 1991, 1991, 1992, 1992, 1992),
                qty = c(9, 99, 10, 10, 105, 15),
                price = c(10, 13, 2, 11, 15, 3))
I'd like to do this with dplyr.
Using a distance matrix to merge fuzzy strings
Main principle
Get the distance matrix between the unique terms of your vectors. Then check what threshold might lead to the best results (this has to be human supervised, I think).
Then use this new correspondence table to merge your data frames. Finally, you can change names (e.g. adding "inc.") more easily because you have "standardized" names.
With utils::adist()
I think stringdist is better because you can choose the method, but here is a base example as a suggestion on how to use this concept of distance to get the expected output.
# 1st create a matrix with the standard Levenshtein distance between
# the name fields of both sources (or another method from stringdist)
dist_name_matrix <- adist(unique(a$name), unique(b$name), partial = TRUE, ignore.case = TRUE)
colnames(dist_name_matrix) <- unique(b$name)
rownames(dist_name_matrix) <- unique(a$name)
# let's convert this matrix to a data frame for easier visual checking; you will need to inspect it yourself
library(dplyr)
library(tidyr)
dist_df <- dist_name_matrix %>%
  as.data.frame() %>%
  tibble::rownames_to_column(., "a_name") %>%
  pivot_longer(cols = 2:last_col(), names_to = "b_name", values_to = "dist") %>%
  filter(dist < 2) # you might need to adapt this to your needs
# Now this can be used to merge your data, i.e.
a %>%
  left_join(., dist_df, by = c("name" = "a_name")) %>%
  right_join(., b, by = c("b_name" = "name", "date" = "date")) %>%
  # added just to match your expected output
  filter(!is.na(name)) %>%
  select(b_name, date, qty, price)
Output:
      b_name date qty price
1    Ace Co. 1991   9    10
2 Bayes INC. 1991  99    13
3        asd 1991  10     2
4    Ace Co. 1992  10    11
5 Bayes INC. 1992 105    15
6        asd 1992  15     3
Same process can be used with stringdist:
library(stringdist)
dist_name_matrix <- stringdistmatrix(unique(a$name), unique(b$name), method = "jw", useBytes = FALSE)
colnames(dist_name_matrix) <- unique(b$name)
rownames(dist_name_matrix) <- unique(a$name)
Then just adapt the threshold after a human check, i.e. filter(dist < 0.2).
agrep solution
The following function is almost surely not as general as it ought to be, but here it goes.
funMerge <- function(X, Y, col, col_approx, sep = "."){
  other_cols.x <- setdiff(names(X), c(col, col_approx))
  other_cols.y <- setdiff(names(Y), c(col, col_approx))
  sp.x <- split(X, X[[col]])
  sp.y <- split(Y, Y[[col]])
  common_names <- intersect(names(sp.x), names(sp.y))
  res <- sapply(common_names, function(sp.name){
    x <- sp.x[[sp.name]]
    y <- sp.y[[sp.name]]
    k <- sapply(x[[col_approx]], agrep, y[[col_approx]])
    k <- k[sapply(k, length) > 0]
    k <- unlist(k)
    i <- match(names(k), x[[col_approx]])
    df_other.x <- x[k, other_cols.x, drop = FALSE]
    df_other.y <- y[k, other_cols.y, drop = FALSE]
    df_tmp <- data.frame(
      x[k, col],
      names(k),
      y[k, col_approx]
    )
    names(df_tmp) <- c(col, col_approx, paste(col_approx, "y", sep = sep))
    cbind(df_tmp, df_other.x, df_other.y)
  }, simplify = FALSE)
  res <- do.call(rbind, res)
  row.names(res) <- NULL
  res
}
funMerge(a, b, col = "date", col_approx = "name")
#  date   name     name.y price qty
#1 1991 Ace Co    Ace Co.    10   9
#2 1991  Bayes Bayes Inc.    13  99
#3 1991    asd       asdf     2  10
#4 1992 Ace Co    Ace CO.    11  10
#5 1992  Bayes Bayes INC.    15 105
#6 1992    asd      aasdf     3  15
stringdist solution
The following function uses package stringdist to compute the Jaro-Winkler pairwise distances between the columns that need to be matched approximately.
From help('stringdist-metrics'), my emphasis.
The metric you need to choose for an application strongly depends on both the nature of the string (what does the string represent?) and the cause of dissimilarities between the strings you are measuring. For example, if you are comparing human-typed names that may contain typo's, the Jaro-Winkler distance may be of use. If you are comparing names that were written down after hearing them, a phonetic distance may be a better choice.
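As a quick sketch of the scales involved, compare two methods on one of the sample pairs; "jw" distances live in [0, 1] while "lv" counts edit operations, so thresholds are not interchangeable between methods:
library(stringdist)
stringdist("Bayes", "Bayes INC.", method = "jw") # Jaro-Winkler: a fraction in [0, 1]
stringdist("Bayes", "Bayes INC.", method = "lv") # Levenshtein: 5, one insertion per character of " INC."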
A more efficient algorithm would be to first split the data sets by the exact-match column and then apply the method of funMerge2 within each group; see the sketch after the output below.
library(stringdist)
funMerge2 <- function(X, Y, col, col_approx, method = "jw", threshold = 0.2){
  x <- X[[col_approx]]
  y <- Y[[col_approx]]
  d <- stringdistmatrix(x, y, method = method, useBytes = FALSE)
  w <- which(d < threshold, arr.ind = TRUE)
  Z1 <- X[w[, "row"], ]
  Z2 <- Y[w[, "col"], ]
  res <- cbind(Z1, Z2)
  common_cols <- grep(col, names(res))
  res <- res[apply(res[, common_cols], 1, function(x) x[1] == x[2]), ]
  row.names(res) <- NULL
  res
}
funMerge2(a, b, col = "date", col_approx = "name")
#    name date price       name date qty
#1 Ace Co 1991    10    Ace Co. 1991   9
#2  Bayes 1991    13 Bayes Inc. 1991  99
#3    asd 1991     2       asdf 1991  10
#4 Ace Co 1992    11    Ace CO. 1992  10
#5  Bayes 1992    15 Bayes INC. 1992 105
#6    asd 1992     3      aasdf 1992  15
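As for that efficiency note, here is a hedged sketch of the split-first variant (funMerge3 is a hypothetical name; it combines the grouping of funMerge with the distance test of funMerge2):
library(stringdist)
funMerge3 <- function(X, Y, col, col_approx, method = "jw", threshold = 0.2){
  # Split both data sets by the exact-match column first
  sp.x <- split(X, X[[col]])
  sp.y <- split(Y, Y[[col]])
  common_names <- intersect(names(sp.x), names(sp.y))
  res <- lapply(common_names, function(sp.name){
    x <- sp.x[[sp.name]]
    y <- sp.y[[sp.name]]
    # The distance matrix is computed within one group only, so it stays small
    d <- stringdistmatrix(x[[col_approx]], y[[col_approx]], method = method, useBytes = FALSE)
    w <- which(d < threshold, arr.ind = TRUE)
    y_part <- y[w[, "col"], , drop = FALSE]
    names(y_part) <- paste(names(y_part), "y", sep = ".") # disambiguate before row-binding
    cbind(x[w[, "row"], , drop = FALSE], y_part)
  })
  res <- do.call(rbind, res)
  row.names(res) <- NULL
  res
}
funMerge3(a, b, col = "date", col_approx = "name")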

How to data wrangle and mutate at column containing specific string?

It is hard to describe in words, so I made a reprex with input, output and expected output below.
How can we data wrangle this?
1. When we write a function and mutate as shown below, there is ambiguity each time because the column name string differs.
2. How can we rbind these once we have unique column names?
library(tidyverse)
# Basically, "." means ",". So we had better remove the . and PC and convert to numeric
df1 <- tribble(
  ~`ABC sales 01.01.2019 - 01.02.2019`, ~code,
  "1.019 PC", 2000, # Actually, it is 1019 (remove . and PC)
  "100 PC", 2101,
  "3.440 PC", 2002
)
df2 <- tribble(
  ~`ABC sales 01.03.2019 - 01.04.2019`, ~year,
  "6.019 PC", 2019,
  "20 PC", 2001,
  "043.440 PC", 2002
)
df3 <- tribble(
  ~`ABC sales 01.05.2019 - 01.06.2019`, ~year,
  "1.019 PC", 2000,
  "701 PC", 2101,
  "6.440 PC", 2002
)
# Input data
input_df = list(df1,df2,df3)
#### function to clean data
# str_replace is used twice because
# remove PC and dot
data_read = function(file){
  df_ <- df %>% #glimpse()
    # Select the column to remove PC, spaces and .
    # Each time the column name differs, so `ABC sales 01.01.2019 - 01.02.2019` cannot be used
    mutate_at(sales_dot = str_replace(select(contains('ABC')), "PC", ""),
              sales = str_replace(sales_dot, "\\.", ""), # name the new column so that rbind can be applied later
              sales_dot = NULL, # delete the old column
              vars(contains("ABC")) = NULL # delete the old column
    )
  df_
}
# attempt to resolve
# To clean the data from dots and PC
output_df1 <- map(input_df, data_read) # or lapply ?
# rbind
output = map(output_df1, rbind) # or lapply ?
expected_output <- tribble(
  ~sales, ~year,
  "1019", 2000,
  "100", 2101,
  "3440", 2002,
  "6019", 2019,
  "20", 2001,
  "043440", 2002,
  "1019", 2000,
  "701", 2101,
  "6440", 2002
)
Using purrr, dplyr and stringr, you can do:
map_df(.x = input_df, ~ .x %>%
         set_names(., c("sales", "year"))) %>%
  mutate(sales = str_remove_all(sales, "[. PC]"))
  sales  year
  <chr> <dbl>
1 1019   2000
2 100    2101
3 3440   2002
4 6019   2019
5 20     2001
6 043440 2002
7 1019   2000
8 701    2101
9 6440   2002
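If the frames can carry extra columns beyond these two, a hedged variant (assuming dplyr >= 1.0 for rename_with, and exactly one "ABC" column plus one id column per frame) renames only the matching column:
library(tidyverse)
map_df(input_df, ~ .x %>%
         rename_with(~ "sales", contains("ABC")) %>% # rename only the ABC... sales column
         rename_with(~ "year", -sales)) %>%          # the single remaining column becomes year
  mutate(sales = str_remove_all(sales, "[. PC]"))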

How to write a function in R to download files and gather the data?

Here are the URLs I have saved (gapminder data), and I have saved the variable names into a vector.
if(!file.exists("./data")) {dir.create("./data")}
fileUrls <- c("https://docs.google.com/spreadsheet/pub?key=0AkBd6lyS3EmpdHo5S0J6ekhVOF9QaVhod05QSGV4T3c&output=xlsx",
"https://docs.google.com/spreadsheet/pub?key=phAwcNAVuyj2tPLxKvvnNPA&output=xlsx",
"https://docs.google.com/spreadsheet/pub?key=phAwcNAVuyj0XOoBL_n5tAQ&output=xlsx")
var_names <- c("GDP","life_expectancy", "population")
I want to fill in the function get_clean to download and read in the Excel file from the URL provided, and then put the data in a column with the variable name specified in var_name.
get_clean <- function(url_in, var_name){
}
I can do it as separate code, but I don't know how to write it as a function. For example:
life_expect_url <- fileUrls[[2]]
download.file(life_expect_url, destfile = "./data/tmp.xlsx", mode = "wb")
life_expect <- read_excel("./data/tmp.xlsx")
# change the name of the first variable to country
names(life_expect)[[1]] <- "country"
life_expect <- life_expect %>%
  gather(key = "year",
         value = !!var_names[[2]],
         -country,
         na.rm = TRUE,
         convert = TRUE)
head(life_expect, n = 5)
pop_url <- fileUrls[[3]]
download.file(pop_url, destfile = "./data/tmp.xlsx", mode = "wb")
pop <- read_excel("./data/tmp.xlsx")
# change the name of the first variable to country
names(pop)[[1]] <- "country"
pop <- pop %>%
  gather(key = "year",
         value = !!var_names[[3]],
         -country,
         na.rm = TRUE,
         convert = TRUE)
head(pop, n = 5)
I tried this
get_clean <- function(url_in, var_name){
  download.file(url_in, destfile = "./data/tmp.xlsx", mode = "wb")
  a <- read_excel("./data/tmp.xlsx")
  names(a)[[1]] <- "country"
  a <- a %>%
    gather(key = "year",
           value = !!var_name,
           -country,
           na.rm = TRUE,
           convert = TRUE)
}
out1 <- get_clean(fileUrls[1],var_names[1])
head(out1)
Is that right?
Should I use a for loop?
The result should be like this:
## # A tibble: 6 x 3
##   country    year        GDP
##   <chr>     <dbl>      <dbl>
## 1 Algeria    1960  1280.3848
## 2 Argentina  1960  5251.8768
## 3 Australia  1960  9407.6851
## 4 Austria    1960  7434.1837
## 5 Bahamas    1960 11926.4610
This way, the files will be downloaded as temp files and the final result will be a list containing the three datasets.
fileUrls <- c("https://docs.google.com/spreadsheet/pub?key=0AkBd6lyS3EmpdHo5S0J6ekhVOF9QaVhod05QSGV4T3c&output=xlsx",
"https://docs.google.com/spreadsheet/pub?key=phAwcNAVuyj2tPLxKvvnNPA&output=xlsx",
"https://docs.google.com/spreadsheet/pub?key=phAwcNAVuyj0XOoBL_n5tAQ&output=xlsx")
var_names <- c("GDP","life_expectancy", "population")
get_clean <- function(fileUrl, var_name){
  # one temp file per URL (tempfile() is vectorised over fileext)
  tmpfile <- tempfile(fileext = rep(".xlsx", length(fileUrl)))
  lapply(seq_along(fileUrl), function(x) {
    link <- fileUrl[x]
    download.file(link, tmpfile[x], mode = "wb")
    file <- readxl::read_excel(tmpfile[x])
    names(file)[1] <- "country"
    file <- file %>%
      tidyr::gather(year, !!rlang::sym(var_name[x]), -country,
                    na.rm = TRUE, convert = TRUE)
    file
  })
}
l <- get_clean(fileUrls, var_names)
l[[1]]
# A tibble: 7,988 x 3
   country     year    GDP
   <chr>      <dbl>  <dbl>
 1 Algeria     1960  1280.
 2 Argentina   1960  5252.
 3 Australia   1960  9408.
 4 Austria     1960  7434.
 5 Bahamas     1960 11926.
 6 Bangladesh  1960   255.
 7 Barbados    1960  3397.
 8 Belgium     1960  7455.
 9 Belize      1960   950.
10 Benin       1960   257.
# … with 7,978 more rows
If you want to keep files stored in a specific folder after download, you just need to change the part that builds the path:
get_clean <- function(fileUrl, var_name){
  filepath <- paste0("./data/", var_name, ".xlsx")
  lapply(seq_along(fileUrl), function(x) {
    link <- fileUrl[x]
    download.file(link, filepath[x], mode = "wb")
    file <- readxl::read_excel(filepath[x])
    names(file)[1] <- "country"
    file <- file %>%
      tidyr::gather(year, !!rlang::sym(var_name[x]), -country,
                    na.rm = TRUE, convert = TRUE)
    file
  })
}
l <- get_clean(fileUrls, var_names)
l[[1]]
Yes, you have to make a loop, but I suggest not using a for loop. Instead, use lapply. It's clean, fast (if correctly built), and does not populate your environment and RAM with elements created inside the loop.
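For instance, a minimal sketch driving the scalar get_clean from the question one URL at a time with Map (assuming that version works as posted and the ./data folder exists):
# Hypothetical usage of the question's scalar get_clean(url_in, var_name)
results <- Map(get_clean, fileUrls, var_names)
names(results) <- var_names
head(results$GDP)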

Grepl and Extract the Match in R

In R I have:
library(tidyverse)
full_names <- tibble(FIRM = c("APPLE INC.", "MICROSOFT CORPORATION", "GOOGLE", "TESLA INC.", "ABBOTT LABORATORIES"),
                     TICKER = c("AAPL", "MSFT", "GOOGL", "TSLA", "ABT"),
                     ID = c(111, 222, 333, 444, 555)) # a dataset with full names of firms, including some IDs
abbr_names <- c("Abbott", "Apple", "Coca-Cola", "Pepsi", "Microsoft", "Tesla") # a vector with abbreviated names of firms
I want to check whether the abbreviated names are in the full names dataset and, if so, match the full_names row to the abbr_names vector, like:
    [1]       [2]                   [3]  [4]
[1] Abbott    ABBOTT LABORATORIES   ABT  555
[2] Apple     APPLE INC.            AAPL 111
[3] Microsoft MICROSOFT CORPORATION MSFT 222
[4] Tesla     TESLA INC.            TSLA 444
I tried several str_extract and grepl functions but could not make it work yet.
matches <- unlist(sapply(toupper(abbr_names), grep, x = full_names$FIRM, value = TRUE))
That will give you a vector with the abbreviations as names and the firms as values.
names(matches)
# [1] "ABBOTT"    "APPLE"     "MICROSOFT" "TESLA"
c(matches, use.names = FALSE)
# [1] "ABBOTT LABORATORIES"   "APPLE INC."            "MICROSOFT CORPORATION" "TESLA INC."
There are a variety of ways to put this together... cobbling...
From @Oscar's comment, we get the desired output with a total of two lines of code:
matches <- unlist(sapply(toupper(abbr_names), grep, x = full_names$FIRM, value = TRUE))
tibble(ABBR_FIRM = names(matches), FIRM = matches) %>% left_join(., full_names, by = "FIRM")
How about this?
full_names$row_num <- 1:nrow(full_names)
do.call(rbind,
        lapply(abbr_names,
               function(x){
                 if(sum(grepl(x, full_names$FIRM, ignore.case = TRUE)) > 0){
                   row <- grepl(x, full_names$FIRM, ignore.case = TRUE) %>%
                     which()
                 } else {row <- 0}
                 data.frame("name" = x,
                            "row_num" = row)
               })) %>%
  right_join(full_names, by = "row_num")
My advice: convert all of the words to uppercase or lowercase. It is easier for functions such as grepl to make comparisons.
My code:
library(tidyverse)
full_names <- tibble(FIRM = c("APPLE INC.", "MICROSOFT CORPORATION", "GOOGLE", "TESLA INC.", "ABBOTT LABORATORIES"),
                     TICKER = c("AAPL", "MSFT", "GOOGL", "TSLA", "ABT"),
                     ID = c(111, 222, 333, 444, 555)) # a dataset with full names of firms, including some IDs
abbr_names <- c("Abbott", "Apple", "Coca-Cola", "Microsoft", "Tesla") # a vector with abbreviated names of firms
Here I created a new column, the one where we want to store the matches found by grepl:
full_names$new_column <- NA
Then I looped over the names that we want to index in the data frame:
for(i in 1:length(abbr_names)){
  search_test <- grepl(tolower(substr(abbr_names[i], 0, 4)), tolower(full_names$FIRM))
  position <- grep("TRUE", search_test)
  full_names$new_column[position] <- abbr_names[i]
}
The result is the following data frame:
                   FIRM TICKER  ID new_column
1            APPLE INC.   AAPL 111      Apple
2 MICROSOFT CORPORATION   MSFT 222  Microsoft
3                GOOGLE  GOOGL 333         NA
4            TESLA INC.   TSLA 444      Tesla
5   ABBOTT LABORATORIES    ABT 555     Abbott
"GOOG" is not in the abbr_names vector, so the return is NA
Another option might be, e.g., this ...
map_int(abbr_names, ~ {
  idx <- grep(., full_names$FIRM, ignore.case = TRUE)
  if (length(idx) == 0) return(NA) else return(idx)
}) %>%
  cbind(ABBR = abbr_names, FIRM = full_names$FIRM[.]) %>%
  as_tibble() %>%
  left_join(full_names, by = "FIRM") %>%
  complete(FIRM)
# A tibble: 4 x 5
  FIRM                  .     ABBR      TICKER    ID
  <chr>                 <chr> <chr>     <chr>  <dbl>
1 ABBOTT LABORATORIES   5     Abbott    ABT      555
2 APPLE INC.            1     Apple     AAPL     111
3 MICROSOFT CORPORATION 2     Microsoft MSFT     222
4 TESLA INC.            4     Tesla     TSLA     444
Just wanted to still post it :)

Write data frame to multiple csv's in R

I have the below data frame, which contains information about different states.
long=c(-106.61291,-106.61291,-106.61291,-81.97224,-81.97224,-81.97224,-84.4277,-84.4277,-84.4277)
lat=c(35.04333,35.04333,35.04333,33.37378,33.37378,33.37378,33.64073,33.64073,33.64073)
city=c("Albuquerque","Albuquerque","Albuquerque","Augusta","Augusta","Augusta","Atlanta","Atlanta","Atlanta")
date=c("2017-08-22","2017-08-23","2017-09-24","2017-09-28","2017-10-24","2017-09-22","2017-11-12","2017-010-14","2017-09-03")
value=c(12,10.8,18.3,12.4,43,21,12,32.1,14)
df<-data.frame(long,lat,city,date,value)
Problem: I want to write each city's information to an individual CSV, and each CSV should look like below.
Final output:
Albuquerque.csv
       long      lat        city       date value
1 -106.6129 35.04333 Albuquerque 2017-08-22  12.0
2 -106.6129 35.04333 Albuquerque 2017-08-23  10.8
3 -106.6129 35.04333 Albuquerque 2017-09-24  18.3
Augusta.csv
       long      lat    city       date value
1 -81.97224 33.37378 Augusta 2017-09-28  12.4
2 -81.97224 33.37378 Augusta 2017-10-24  43.0
3 -81.97224 33.37378 Augusta 2017-09-22  21.0
Atlanta.csv
      long      lat    city        date value
1 -84.4277 33.64073 Atlanta  2017-11-12  12.0
2 -84.4277 33.64073 Atlanta 2017-010-14  32.1
3 -84.4277 33.64073 Atlanta  2017-09-03  14.0
Thanks in advance!
# Split dataframe by city
split_df <- split(df, list(df$city))
# Write out separate CSV for each city
for (city in names(split_df)) {
  write.csv(split_df[[city]], paste0(city, ".csv"))
}
long=c(-106.61291,-106.61291,-106.61291,-81.97224,-81.97224,-81.97224,-84.4277,-84.4277,-84.4277)
lat=c(35.04333,35.04333,35.04333,33.37378,33.37378,33.37378,33.64073,33.64073,33.64073)
city=c("Albuquerque","Albuquerque","Albuquerque","Augusta","Augusta","Augusta","Atlanta","Atlanta","Atlanta")
date=c("2017-08-22","2017-08-23","2017-09-24","2017-09-28","2017-10-24","2017-09-22","2017-11-12","2017-010-14","2017-09-03")
value=c(12,10.8,18.3,12.4,43,21,12,32.1,14)
df<-data.frame(long,lat,city,date,value)
dflist <- split(df, f = df$city)
sapply(names(dflist),
       function(x) write.csv(dflist[[x]], file = paste(x, "csv", sep = ".")))
There are a few different ways of doing this, but a very quick approach for all your cities at once is to take advantage of the apply family of functions in base R, specifically lapply.
long=c(-106.61291,-106.61291,-106.61291,-81.97224,-81.97224,-81.97224,-84.4277,-84.4277,-84.4277)
lat=c(35.04333,35.04333,35.04333,33.37378,33.37378,33.37378,33.64073,33.64073,33.64073)
city=c("Albuquerque","Albuquerque","Albuquerque","Augusta","Augusta","Augusta","Atlanta","Atlanta","Atlanta")
date=c("2017-08-22","2017-08-23","2017-09-24","2017-09-28","2017-10-24","2017-09-22","2017-11-12","2017-010-14","2017-09-03")
value=c(12,10.8,18.3,12.4,43,21,12,32.1,14)
df<-data.frame(long,lat,city,date,value)
# create a convenience function to split your data and export to csv
split_into_csv <- function(x) {
  tmp <- df[df$city == x, ]
  write.csv(tmp, file = paste0(x, ".csv"))
}
# Apply split_into_csv over the cities with lapply
# (unique() rather than levels(), since strings are no longer factors by default in R >= 4.0)
lapply(unique(df$city), split_into_csv)
# Check output in directory
dir()
[1] "Albuquerque.csv" "Atlanta.csv" "Augusta.csv"
You can do this with base R or with the dplyr package.
dplyr way:
dplyr::filter(df, city == 'Albuquerque') %>% write.csv(file = 'Albuquerque.csv', row.names = FALSE)
dplyr::filter(df, city == 'Augusta') %>% write.csv(file = 'Augusta.csv', row.names = FALSE)
dplyr::filter(df, city == 'Atlanta') %>% write.csv(file = 'Atlanta.csv', row.names = FALSE)
base R:
write.csv(df[df$city == 'Albuquerque', ], file = 'Albuquerque.csv', row.names = FALSE)
write.csv(df[df$city == 'Augusta', ], file = 'Augusta.csv', row.names = FALSE)
write.csv(df[df$city == 'Atlanta', ], file = 'Atlanta.csv', row.names = FALSE)
You can use a for loop if you start getting more cities.
for (city in c('Albuquerque', 'Augusta', 'Atlanta')) {
  write.csv(df[df$city == city, ], file = paste0(city, '.csv'), row.names = FALSE)
}
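One more sketch, using purrr's iwalk, which passes each element of the split list together with its name (here, the city) so no loop bookkeeping is needed:
library(purrr)
# write one CSV per city; .x is the sub-data-frame, .y its name from split()
iwalk(split(df, df$city), ~ write.csv(.x, paste0(.y, ".csv"), row.names = FALSE))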
