In R I have:
library(tidyverse)
full_names <- tibble(FIRM = c("APPLE INC.", "MICROSOFT CORPORATION", "GOOGLE", "TESLA INC.", "ABBOTT LABORATORIES"),
TICKER = c("AAPL", "MSFT", "GOOGL", "TSLA", "ABT"),
ID = c(111, 222, 333, 444, 555)) # a dataset with full names of firms, including some IDs
abbr_names <- c("Abbott", "Apple", "Coca-Cola", "Pepsi", "Microsoft", "Tesla") # a vector with abbreviated names of firms
I want to check if the abbreviated names are in the full names dataset, and if true subsequently match the full_names row to the abbr_names vector, like:
[1] [2] [3] [4]
[1] Abbott ABBOTT LABORATORIES ABT 555
[2] Apple APPLE INC. AAPL 111
[3] Microsoft MICROSOFT CORPORATION MSFT 222
[4] Tesla TESLA INC. TSLA 444
Tried several str_extract and grepl functions, but could not make it work yet.
matches <- unlist(sapply(toupper(abbr_names), grep, x = full_names$FIRM, value = TRUE))
That will give you a vector with the names as abbreviations and the firms as values
names(matches)
# [1] "ABBOTT" "APPLE" "MICROSOFT" "TESLA"
# The matched firm names only, with the abbreviation labels dropped
# (`matches` is the named vector built by the sapply()/grep() call).
unname(matches)
# [1] "ABBOTT LABORATORIES" "APPLE INC." "MICROSOFT CORPORATION" "TESLA INC."
There are a variety of ways to put this together... cobbling...
From @Oscar's comment, we get the desired output with a total of two lines of code:
matches <- unlist(sapply(toupper(abbr_names), grep, x = full_names$FIRM, value = TRUE))
tibble(ABBR_FIRM = names(matches), FIRM = matches) %>% left_join(., full_names, by = "FIRM")
how about this?
# Tag each row of full_names with its position so matches can be joined back.
full_names$row_num <- seq_len(nrow(full_names))
# For every abbreviation, record the row(s) of full_names whose FIRM contains
# it (case-insensitive). Abbreviations without a match get row_num 0, which
# pairs with no row of full_names and is therefore dropped by the right join.
do.call(
  rbind,
  lapply(abbr_names, function(x) {
    # Evaluate the pattern once and reuse the result
    row <- which(grepl(x, full_names$FIRM, ignore.case = TRUE))
    if (length(row) == 0) {
      row <- 0
    }
    data.frame("name" = x, "row_num" = row)
  })
) %>%
  right_join(full_names, by = "row_num")
My advice: convert all the words to either upper case or lower case first. That makes it easier for functions such as grepl to compare them.
My code:
library(tidyverse)
full_names <- tibble(FIRM = c("APPLE INC.", "MICROSOFT CORPORATION", "GOOGLE", "TESLA INC.", "ABBOTT LABORATORIES"),
TICKER = c("AAPL", "MSFT", "GOOGL", "TSLA", "ABT"),
ID = c(111, 222, 333, 444, 555)) # a dataset with full names of firms, including some IDs
abbr_names <- c("Abbott", "Apple", "Coca-Cola", "Microsoft", "Tesla") # a vector with abbreviated names of firms
Here I create a new column, which will hold the abbreviated name for each row that grepl matches:
full_names$new_column <- NA
Then I loop over the names that we want to look up in the data frame:
# For each abbreviated name, look for its first four characters
# (case-insensitively) in FIRM and label the matching rows with that name.
for (i in seq_along(abbr_names)) {
  search_test <- grepl(
    tolower(substr(abbr_names[i], 1, 4)),
    tolower(full_names$FIRM)
  )
  # which() reads the logical vector directly; no need to grep for "TRUE"
  position <- which(search_test)
  full_names$new_column[position] <- abbr_names[i]
}
The result is the following data frame:
FIRM TICKER ID new_column
1 APPLE INC. AAPL 111 Apple
2 MICROSOFT CORPORATION MSFT 222 Microsoft
3 GOOGLE GOOGL 333 NA
4 TESLA INC. TSLA 444 Tesla
5 ABBOTT LABORATORIES ABT 555 Abbott
"GOOG" is not in the abbr_names vector, so the return is NA
Another option might be eg this ...
# Look up, for every abbreviation, the row of full_names whose FIRM contains it
# (case-insensitive), then bind the abbreviation and firm name to that index
# and join the remaining columns of full_names back on.
map_int(abbr_names, ~ {
  idx <- grep(.x, full_names$FIRM, ignore.case = TRUE)
  # map_int() needs exactly one integer per abbreviation: NA when nothing
  # matches, otherwise the first matching row
  if (length(idx) == 0) NA_integer_ else idx[1]
}) %>%
  cbind(ABBR = abbr_names, FIRM = full_names$FIRM[.]) %>%
  as_tibble() %>%
  left_join(full_names, by = "FIRM") %>%
  complete(FIRM)
# A tibble: 4 x 5
FIRM . ABBR TICKER ID
<chr> <chr> <chr> <chr> <dbl>
1 ABBOTT LABORATORIES 5 Abbott ABT 555
2 APPLE INC. 1 Apple AAPL 111
3 MICROSOFT CORPORATION 2 Microsoft MSFT 222
4 TESLA INC. 4 Tesla TSLA 444
Just wanted to still post it :)
Related
I have two data frames in these formats.
df1 <- data.frame (Year = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA",
"FAMILY BROS", "family, apple, ibm","family co.")
)
df2 <- data.frame (Firm = c("APPLE ", "IBM", "Sonata Inc.","Family Bros. Co."))
I need to create a data frame that shows each firm and its corresponding year of being a winner as illustrated in Data3 in the attached figure. I checked few links like this one Merge tables in R using like
where they use a like operator but am unable to create the desired data as there can be multiple winners in a year. Please suggest what functions should I try to create Data3. Thanks!
Figure - Desired Data Format
Using adist basically.
# Split each Winner entry on commas/semicolons and trim the pieces.
sp <- strsplit(df1$Winner, ',|;') |> lapply(trimws)
# Pad every split result to the longest length (filling with NA), stack the
# results as rows of a data frame, and append the Year column.
sp <- t(sapply(sp, `length<-`, max(lengths(sp)))) |> as.data.frame() |> cbind(Year=df1$Year)
# Reshape columns 1:3 to long format keyed on column 4, then drop the NA padding.
# NOTE(review): 1:3 and 4 are hard-coded; presumably they assume at most three
# winners per year (as in the sample data), with Year as column 4. Confirm
# before reusing on other data.
sp <- reshape(sp, 1:3, idvar=4, direction='l', sep='') |> na.omit()
# Cluster the lower-cased names (with 'inc'/'co' removed) by edit distance, cut
# the tree into 4 groups, and label the groups with firm names.
# NOTE(review): the label order presumably has to match the cluster numbering
# that cutree() produces for this particular data; verify.
sp$Firm <- cutree(hclust(as.dist(adist(gsub('inc|co', '', tolower(sp$V))))), 4) |>
factor(labels=c('Apple', 'Sonata Inc.', 'Family Bros. Co.', 'IBM'))
# Show Firm and Year, ordered by firm.
subset(sp[order(sp$Firm), ], select=c(Firm, Year))
# Firm Year
# 1.1 Apple 1991
# 2.1 Apple 1992
# 3.1 Apple 1993
# 6.2 Apple 1996
# 4.1 Sonata Inc. 1994
# 5.1 Family Bros. Co. 1995
# 6.1 Family Bros. Co. 1996
# 7.1 Family Bros. Co. 1997
# 3.2 IBM 1993
# 6.3 IBM 1996
Try this
df <- sapply(gsub("\\s[a-zA-Z]+\\W" , "" ,trimws(df2$Firm)),
function(x) grepl(tolower(x) ,
tolower(df1$Winner)))
l <- lapply(data.frame(df), function(x) df1$Year[x])
l
If you want the answer in data.frame use
ans <- data.frame(Firm = gsub("[0-9]+","",names(unlist(l))) ,
year = unlist(l))
row.names(ans) <- NULL
ans
Using fuzzyjoin.
(Use the second example only if the precise ordering matters.)
library(tidyverse)
library(fuzzyjoin)
# Data
df1 <- data.frame (Year = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA",
"FAMILY BROS", "family, apple, ibm","family co.")
)
df2 <- data.frame (Firm = c("APPLE ", "IBM", "Sonata Inc.","Family Bros. Co."))
# If the order is unimportant
df1_sep <- df1 |>
separate_rows(Winner) |>
filter(!Winner %in% c("", "CO.", "inc.", "co.", "INC.", "BROS"))
df2 |>
mutate(Firm = str_squish(Firm)) |>
regex_right_join(df1_sep, by = c("Firm" = "Winner"), ignore_case = TRUE) |>
arrange(Firm, Year) |>
select(-Winner)
#> Firm Year
#> 1 APPLE 1991
#> 2 APPLE 1992
#> 3 APPLE 1993
#> 4 APPLE 1996
#> 5 Family Bros. Co. 1995
#> 6 Family Bros. Co. 1996
#> 7 Family Bros. Co. 1997
#> 8 IBM 1993
#> 9 IBM 1996
#> 10 Sonata Inc. 1994
# If desired output order matters
df1_sep <- df1 |>
separate_rows(Winner) |>
filter(!Winner %in% c("", "CO.", "inc.", "co.", "INC.", "BROS"))
df2 |>
mutate(Firm = str_squish(Firm)) |>
regex_right_join(df1_sep, by = c("Firm" = "Winner"), ignore_case = TRUE) |>
group_by(Firm) |>
mutate(sort = min(Year)) |>
ungroup() |>
arrange(sort, Year) |>
select(-Winner, -sort)
#> # A tibble: 10 × 2
#> Firm Year
#> <chr> <dbl>
#> 1 APPLE 1991
#> 2 APPLE 1992
#> 3 APPLE 1993
#> 4 APPLE 1996
#> 5 IBM 1993
#> 6 IBM 1996
#> 7 Sonata Inc. 1994
#> 8 Family Bros. Co. 1995
#> 9 Family Bros. Co. 1996
#> 10 Family Bros. Co. 1997
Created on 2022-06-18 by the reprex package (v2.0.1)
Base R, sure a simpler solution exists:
# Split each winning company up into separate elements in a list
# of character vectors: winning_companies => list of character vectors
winning_companies <- strsplit(
df1$Winner,
"\\;|\\,"
)
# Unroll the data.frame: df1_unrolled => data.frame
df1_unrolled <- data.frame(
do.call(
rbind,
lapply(
seq_len(nrow((df1))),
function(i){
transform(
df1[rep(i, length(winning_companies[[i]])),],
Winner = trimws(unlist(winning_companies[[i]]), "both")
)
}
)
),
stringsAsFactors = FALSE,
row.names = NULL
)
# Clean up the search terms: firm_names_std => character vector
df2$firm_names_std <- trimws(
gsub(
"\\w+\\.",
"",
tolower(
df2$Firm
)
),
"both"
)
# Resolve a dictionary to be used to lookup items:
# firm_dictionary => character vector
firm_dictionary <- names(
sort(
table(
df2$firm_names_std
),
decreasing = TRUE
)
)
# Function to correct the spelling: correct_spelling => function
#' Map raw firm names onto the closest entry of a dictionary
#'
#' Each name is lower-cased, stripped of period-terminated tokens (such as
#' "inc." or "co.") and trimmed, then compared with every dictionary entry
#' using partial approximate string distance (`adist(partial = TRUE)`).
#'
#' @param firm_name_vec Character vector of raw firm names.
#' @param firm_dictionary Character vector of candidate names to match against.
#' @param similarity_threshold Optional numeric scalar. When `NULL` (default)
#'   the closest dictionary entry is always returned. Otherwise only entries
#'   whose distance divided by the entry's number of characters is at most
#'   this value are eligible, and the closest eligible entry is returned.
#' @return Character vector with one element per element of `firm_name_vec`:
#'   the chosen dictionary entry, or `NA` when no entry qualifies.
correct_spelling <- function(firm_name_vec, firm_dictionary, similarity_threshold = NULL) {
  # Clean the names: clean_firm_names => character vector
  clean_firm_names <- trimws(
    gsub("\\w+\\.", "", tolower(firm_name_vec)),
    "both"
  )

  # Resolve a single cleaned name against the dictionary:
  # .correct_spelling_scalar => function
  .correct_spelling_scalar <- function(firm_name, firm_dictionary, similarity_threshold) {
    # Distance between the cleaned name and each element in the dictionary:
    # distance_from_dict => 1 x length(firm_dictionary) matrix
    distance_from_dict <- adist(firm_name, firm_dictionary, partial = TRUE)

    if (is.null(similarity_threshold)) {
      # If we are not using a similarity threshold, take the closest entry
      best <- which.min(distance_from_dict)
    } else {
      # Ratio between the distance and the number of characters of each
      # dictionary element: dist_ratio => double
      dist_ratio <- distance_from_dict / nchar(firm_dictionary)
      # Entries within the threshold: candidates => integer vector
      candidates <- which(dist_ratio <= similarity_threshold)
      # Pick the closest eligible entry, not merely the first one that
      # happens to pass the threshold
      best <- candidates[which.min(distance_from_dict[candidates])]
    }

    # `best` is empty when nothing qualifies; fall back to NA in that case
    head(c(firm_dictionary[best], NA_character_), 1)
  }

  # Apply the scalar function to every cleaned name: => character vector
  vapply(
    clean_firm_names,
    .correct_spelling_scalar,
    character(1),
    firm_dictionary = firm_dictionary,
    similarity_threshold = similarity_threshold,
    USE.NAMES = FALSE
  )
}
# Derive the correct spelling of the firms:
# cleaned_firm_names => character vector
cleaned_firm_names <- correct_spelling(
df1_unrolled$Winner,
firm_dictionary
)
# Use the cleaned firm names to look up the formatted names in df2:
# df3 => data.frame
df3 <- transform(
df1_unrolled,
Winner = trimws(
df2$Firm[match(cleaned_firm_names, df2$firm_names_std)],
"both"
)
)
# Output result to console: data.frame => stdout(console)
df3
Data:
df1 <- data.frame (Year = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA",
"FAMILY BROS", "family, apple, ibm","family co.")
)
df2 <- data.frame (Firm = c("APPLE ", "IBM", "Sonata Inc.","Family Bros. Co."))
I have a column in a dataframe that is a character vector. I would like to add to my dataframe a column containing unique ID values/codes corresponding to each unique value in said column. Here is some toy data:
fnames <- c("joey", "joey", "joey", "jimmy", "jimmy", "tommy", "michael", "michael", "michael", "michael", "michael", "kevin", "kevin", "christopher", "aaron", "joshua", "joshua", "joshua", "arvid", "aiden", "kentavious", "lawrence", "xavier")
names <- as.data.frame(fnames)
To get the number of unique values of fnames I run:
unique_fnames <- length(unique(names$fnames))
To generate unique IDs for each unique name, I found the following function:
#' Generate `n` distinct random alphanumeric IDs, reproducibly
#'
#' @param n Number of IDs to generate (0 returns an empty character vector).
#' @param seed_no Seed passed to `set.seed()` on every call, so the same
#'   arguments always produce the same IDs. Note that this resets the
#'   session's random number generator.
#' @param char_len Number of characters in each ID.
#' @return Character vector of `n` unique IDs drawn from a-z, A-Z and 0-9.
create_unique_ids <- function(n, seed_no = 16169, char_len = 6) {
  set.seed(seed_no)
  pool <- c(letters, LETTERS, 0:9)
  # With more IDs requested than exist, the retry loop below could never end
  if (n > length(pool)^char_len) {
    stop(
      "cannot create ", n, " unique IDs of length ", char_len,
      call. = FALSE
    )
  }
  res <- character(n)
  # seq_len() gives an empty loop for n = 0, where seq(n) would give c(1, 0)
  for (i in seq_len(n)) {
    this_res <- paste0(sample(pool, char_len, replace = TRUE), collapse = "")
    # Redraw on collision; compare only against IDs already generated
    while (this_res %in% res[seq_len(i - 1)]) {
      this_res <- paste0(sample(pool, char_len, replace = TRUE), collapse = "")
    }
    res[i] <- this_res
  }
  res
}
Applying create_unique_ids to unique_fnames I get the desired number of ID codes:
unique_fname_id <- create_unique_ids(unique_fnames)
My question is this:
How do I add the vector of unique_fname_id to my dataframe names? The desired result is a dataframe names with a unique_fname_id column that looks something like this:
unique_fname_id <- c("VvWMKt", "VvWMKt", "VvWMKt", "yEbpFq", "yEbpFq", "Z3xCdO"...)
where "VvWMKt" corresponds to "joey", "yEbpFq" corresponds to "jimmy" and so on. The dataframe names would be the same length as the original, just with this added column.
Is there a way to do this? All suggestions are welcome and appreciated. Thanks!
Edit: I need to keep the set.seed in the create_unique_ids function to ensure the IDs generated can be reproduced continuously.
If you want to use your function and keep the seed, you can do:
# Build one ID per distinct name, then attach the IDs to every row of `names`.
names %>%
  distinct(fnames) %>%
  # n() is the number of distinct names, so the count is not hard-coded
  mutate(unique_ID = create_unique_ids(n())) %>%
  left_join(names, by = "fnames")
You can also remove the seed (the set.seed(seed_no) line and parameter) from your function and have a simpler solution:
names %>%
group_by(fnames) %>%
mutate(unique_ID = create_unique_ids(1))
fnames unique_ID
<chr> <chr>
1 joey ea10KC
2 joey ea10KC
3 joey ea10KC
4 jimmy MD5W4d
5 jimmy MD5W4d
6 tommy xR7ozW
7 michael uuGn3h
8 michael uuGn3h
9 michael uuGn3h
10 michael uuGn3h
# ... with 13 more rows
You can also use a built-in function like stringi::stri_rand_strings, which creates random alphanumerical strings with a fixed number of characters:
library(stringi); library(dplyr)
names %>%
group_by(fnames) %>%
mutate(unique_ID = stri_rand_strings(1, 6))
A crude approach is to left join it back
library(tidyverse)
fnames <- c("joey", "joey", "joey", "jimmy", "jimmy", "tommy", "michael", "michael", "michael", "michael", "michael", "kevin", "kevin", "christopher", "aaron", "joshua", "joshua", "joshua", "arvid", "aiden", "kentavious", "lawrence", "xavier")
names <- as.data.frame(fnames)
unique_names <- names |> distinct()
unique_fnames <- length(unique(names$fnames))
#' Generate `n` distinct random alphanumeric IDs, reproducibly
#'
#' @param n Number of IDs to generate (0 returns an empty character vector).
#' @param seed_no Seed passed to `set.seed()` on every call, so the same
#'   arguments always produce the same IDs. Note that this resets the
#'   session's random number generator.
#' @param char_len Number of characters in each ID.
#' @return Character vector of `n` unique IDs drawn from a-z, A-Z and 0-9.
create_unique_ids <- function(n, seed_no = 16169, char_len = 6) {
  set.seed(seed_no)
  pool <- c(letters, LETTERS, 0:9)
  # With more IDs requested than exist, the retry loop below could never end
  if (n > length(pool)^char_len) {
    stop(
      "cannot create ", n, " unique IDs of length ", char_len,
      call. = FALSE
    )
  }
  res <- character(n)
  # seq_len() gives an empty loop for n = 0, where seq(n) would give c(1, 0)
  for (i in seq_len(n)) {
    this_res <- paste0(sample(pool, char_len, replace = TRUE), collapse = "")
    # Redraw on collision; compare only against IDs already generated
    while (this_res %in% res[seq_len(i - 1)]) {
      this_res <- paste0(sample(pool, char_len, replace = TRUE), collapse = "")
    }
    res[i] <- this_res
  }
  res
}
unique_fname_id <- create_unique_ids(unique_fnames)
df_ids <- tibble(fnames = unique_names |> pull(fnames),unique_fname_id = unique_fname_id)
names |>
left_join(df_ids)
#> Joining, by = "fnames"
#> fnames unique_fname_id
#> 1 joey VvWMKt
#> 2 joey VvWMKt
#> 3 joey VvWMKt
#> 4 jimmy yEbpFq
#> 5 jimmy yEbpFq
#> 6 tommy Z3xCdO
#> 7 michael ef8YkZ
#> 8 michael ef8YkZ
#> 9 michael ef8YkZ
#> 10 michael ef8YkZ
#> 11 michael ef8YkZ
#> 12 kevin kDBFAq
#> 13 kevin kDBFAq
#> 14 christopher xR77mJ
#> 15 aaron gaaI1C
#> 16 joshua KM4dD9
#> 17 joshua KM4dD9
#> 18 joshua KM4dD9
#> 19 arvid oTLl7g
#> 20 aiden b63PnV
#> 21 kentavious csnWuE
#> 22 lawrence Ihi5VM
#> 23 xavier HfM0mX
Created on 2021-12-03 by the reprex package (v2.0.1)
I have the following data:
EDIT:
df<- data.frame(
id = c(432, 324, 322, 331, 242,443,223 ),
desc1= c("metal","steels&iron","irons\\copper", "sports material", "leather material", "durable goods", "electronic store")
,
store_names = c("ik bros","steel idrs", "kb materials", "ca pty (ltd)", "bkk stores", "k/k \\shop", "h/j & jj")
,
class = c("", "unknown","", "sports", "unknown", "unknown", "")
)
I want to search keywords from both desc1 and desc2 and assign a string value to class column. For example, keywords can be
indus_1 <- c("iron", "steel")
goods_store_1 <- c("goods", "store", "stores")
electr_1 <- c("electronic", "chips", "semiconductor")
unlabelled_1 <- c("leather")
here variable names indus_1, sports_1 and so on will be used to assign a string value to class. For instance, if "metal" keyword is found I assign indus after stripping away "_1" to class. In my approach, I am finding index of rows where keywords found and copying them to the copy of same dataframe, but this take quite long for a larger dataset, and may miss few classes as I am using \\b to find exact match. Here is the expected output:
id desc1 store_names class
432 metal ik bros
324 steels&iron steel idrs indus
322 irons\\copper kb materials indus
331 sports material ca pty (ltd) sports
242 leather material bkk stores unlabelled
443 durable goods k/k \\shop goods_store
223 electronic store h/j & jj electr
I am looking for a more efficient method to do the same, a fully dplyr version would be preferable. Thanks for suggestions.
In that case, you could do:
# Collect every keyword vector whose name ends in "_1" (indus_1, electr_1, ...),
# leaving out vars_1 itself in case this snippet is re-run.
vars_1 <- mget(ls(pattern = '_1$'))
vars_1 <- vars_1[!grepl('vars', names(vars_1))]
# The class labels are the variable names without the "_1" suffix.
pat <- sub("_1$", "", names(vars_1))
# One regex per class: the keywords of a class are joined with "|" so that
# there is exactly one pattern (name) per label, however many keywords a
# class has.
names(pat) <- sprintf(
  ".*(%s).*",
  vapply(vars_1, paste, character(1), collapse = "|")
)
# Concatenate the desc* columns and replace any text containing a keyword
# with the corresponding class label.
df %>%
  mutate(class = str_replace_all(do.call(str_c, across(starts_with('desc'))), pat))
id desc1 desc2 class
1 432 iron and metal ik bros indus
2 324 sports material ca pty (ltd) sports
3 322 leather material bkk stores unlabelled
4 331 durable goods k/k \\shop goods_store
5 242 electronic goods h/j & jj electr
Logically my answer is similar to @Onyambu's answer but with a few tweaks.
library(tidyverse)
mget(ls(pattern = '_1')) %>%
stack() %>%
group_by(ind = sub('_1', '', ind)) %>%
summarise(values = sprintf('.*\\b(%s)\\b.*', paste0(values, collapse = '|'))) %>%
select(2, 1) %>%
deframe() -> pat
pat
#.*\\b(electronic|chips|semiconductor)\\b.* .*\\b(goods|store|stores)\\b.*
# "electr" "goods_store"
# .*\\b(iron|steel)\\b.* .*\\b(leather)\\b.*
# "indus" "unlabelled"
df %>%
mutate(class2 = str_replace_all(desc1, pat),
class2 = ifelse(desc1 == class2, '', class2))
# id desc1 store_names class class2
#1 432 metal ik bros
#2 324 steels&iron steel idrs unknown indus
#3 322 irons\\copper kb materials
#4 331 sports material ca pty (ltd) sports
#5 242 leather material bkk stores unknown unlabelled
#6 443 durable goods k/k \\shop unknown goods_store
#7 223 electronic store h/j & jj electr
For id = 322 it doesn't match indus because we are looking for an exact match. indus_1 has iron whereas desc1 column has irons.
I have 2 reproducible dataframes over here. I am trying to identify which column contain values that are similar to another column. I hope my code will take in every row and loop through every single column in df2. My code works below, but it requires fine-tuning to allow multiple matches with the same column.
df1 <- data.frame(fruit=c("Apple", "Orange", "Pear"), location = c("Japan", "China", "Nigeria"), price = c(32,53,12))
df2 <- data.frame(grocery = c("Durian", "Apple", "Watermelon"),
place=c("Korea", "Japan", "Malaysia"),
name = c("Mark", "John", "Tammy"),
favourite.food = c("Apple", "Wings", "Cakes"),
invoice = c("XD1", "XD2", "XD3"))
df <- sapply(names(df1), function(x) {
temp <- sapply(names(df2), function(y)
if(any(match(df1[[x]], df2[[y]], nomatch = FALSE))) y else NA)
ifelse(all(is.na(temp)), NA, temp[which.max(!is.na(temp))])
}
)
t1 <- data.frame(lapply(df, type.convert), stringsAsFactors=FALSE)
t1 <- data.frame(t(t1))
t1 <- cbind(newColName = rownames(t1), t1)
rownames(t1) <- 1:nrow(t1)
colnames(t1) <- c("Columns from df1", "Columns from df2")
df1
fruit location price
1 Apple Japan 32
2 Orange China 53
3 Pear Nigeria 12
df2
grocery place name favourite.food invoice
1 Durian Korea Mark Apple XD1
2 Apple Japan John Wings XD2
3 Watermelon Malaysia Tammy Cakes XD3
t1 #(OUTPUT FROM CODE ABOVE)
Columns from df1 Columns from df2
1 fruit grocery
2 location place
3 price <NA>
This is the output I hope to obtain instead:
Columns from df1 Columns from df2
1 fruit grocery, favourite.food
2 location place
3 price <NA>
Notice that the columns, "Grocery" and "favourite.food" both matches to the column "fruit", whereas my code only returns one column.
We can change the code to return all the matches instead and wrap them in one string using toString
# For every column of df1, list the df2 columns that share at least one value
# with it, as a single comma-separated string (NA when there is none).
vec <- vapply(names(df1), function(x) {
  # TRUE for each df2 column containing any value of df1[[x]]
  shared <- vapply(
    df2,
    function(column) any(df1[[x]] %in% column),
    logical(1)
  )
  hits <- names(df2)[shared]
  if (length(hits) == 0) NA_character_ else toString(hits)
}, character(1))
vec
# fruit location price
#"grocery, favourite.food" "place" NA
To convert it into dataframe, we can do
data.frame(columns_from_df1 = names(vec), columns_from_df2 = vec, row.names = NULL)
# columns_from_df1 columns_from_df2
#1 fruit grocery, favourite.food
#2 location place
#3 price <NA>
I want to split a street address into street name and street number in r.
My input data has a column that reads for example
Street.Addresses
205 Cape Road
32 Albany Street
cnr Kempston/Durban Roads
I want to split the street number and street name into two separate columns, so that it reads:
Street Number Street Name
205 Cape Road
32 Albany Street
cnr Kempston/Durban Roads
Is it in any way possible to split the numeric value from the non-numeric entries in a factor/string in R?
Thank you
you can try:
# Split each address in `x` into a street number and a street name.
# The first whitespace-delimited token counts as the street number only when
# it ends in a digit (e.g. "205", "32-34"); otherwise the number is left empty
# and the whole address is kept as the name. Every address yields exactly two
# fields, so the result is always a two-column matrix.
addresses <- as.character(x) # also accepts a factor column
number_pattern <- "^(\\S*\\d)\\s+(.*)$"
has_number <- grepl(number_pattern, addresses)
y <- cbind(
  "Street Number" = ifelse(has_number, sub(number_pattern, "\\1", addresses), ""),
  "Street Name" = ifelse(has_number, sub(number_pattern, "\\2", addresses), addresses)
)
hth
I'm sure that someone is going to come along with a cool regex solution with lookaheads and so on, but this might work for you:
X <- c("205 Cape Road", "32 Albany Street", "cnr Kempston/Durban Roads")
nonum <- grepl("^[^0-9]", X)
X[nonum] <- paste0(" \t", X[nonum])
X[!nonum] <- gsub("(^[0-9]+ )(.*)", "\\1\t\\2", X[!nonum])
read.delim(text = X, header = FALSE)
# V1 V2
# 1 205 Cape Road
# 2 32 Albany Street
# 3 NA cnr Kempston/Durban Roads
Here is another way:
df <- data.frame(
  Street.Addresses = c("205 Cape Road", "32 Albany Street", "cnr Kempston/Durban Roads"),
  stringsAsFactors = FALSE
)
# Split every address on spaces once, instead of re-splitting the whole
# column inside a row-by-row loop.
address_tokens <- strsplit(df[["Street.Addresses"]], " ")
# The first token becomes Street.Number; the remaining tokens, re-joined with
# spaces, become Street.Name.
new_df <- data.frame(
  Street.Number = vapply(address_tokens, function(tok) tok[1], character(1)),
  Street.Name = vapply(
    address_tokens,
    function(tok) paste(tok[-1], collapse = " "),
    character(1)
  ),
  stringsAsFactors = FALSE
)
> new_df
Street.Number Street.Name
1 205 Cape Road
2 32 Albany Street
3 cnr Kempston/Durban Roads