Summing over columns with if statement in R - r

I have a data that looks like this.
bankname bankid year totass corresbankname1 corresbankloc1 corresdepoin1 corresbankname2 corresbankloc2 corresdepoin2 corresbankname3 corresbankloc3 corresdepoin3
BankA 1 1881 244789 First Bank New York 7250.32 Third National Bank Philadelphia 20218.2 Commercial Bank Philadelphia 29513.4
BankB 2 1881 195755 National Bank Pittsburgh 10243.6 Union Trust Company New York 1851.51 NA NA NA
Bankc 3 1881 107736 Mechanics' Bank New York 13357.8 Wyoming Bank Wilkes-Barre 17761.2 NA NA NA
BankD 4 1881 170600 Commonwealth Bank Philadelphia 3.35 Seventh National Bank Philadelphia 2 NA NA NA
BankE 5 1881 320000 National Bank New York 351266 Mechanics' Bank New York 314012 National Park Bank New York 206580
This can be replicated by
# Reproducible example: five banks, each with up to three correspondent
# banks. For every correspondent slot k there is a name (corresbanknamek),
# a city (corresbanklock) and an amount (corresdepoink); NA marks an
# unused slot.
bankname <- c(
  "The Anchor Savings Bank of Pittsburgh",
  "The Arsenal Bank",
  "The Ashley Savings Bank",
  "The Bank of America of Philadelphia",
  "The Bank of Pittsburgh"
)
bankid <- as.numeric(1:5)
year <- rep(1881, 5)
totass <- c(244789, 195755, 107736, 170600, 32000000)

# Correspondent slot 1
corresbankname1 <- c(
  "First National Bank",
  "National Bank of Commerce",
  "Mechanics' National Bank",
  "Commonwealth National Bank",
  "National Bank of Commerce"
)
corresbankloc1 <- c(
  "Philadelphia", "Pittsburgh", "New York", "Philadelphia", "New York"
)
corresdepoin1 <- c(7250.32, 10243.6, 13357.8, 3.35, 351266)

# Correspondent slot 2
corresbankname2 <- c(
  "Third National Bank",
  "Union Trust Company",
  "Wyoming National Bank",
  "Seventh National Bank",
  "Mechanics' National Bank"
)
corresbankloc2 <- c(
  "New York", "New York", "Wilkes-Barre", "Philadelphia", "New York"
)
corresdepoin2 <- c(20218.2, 1851.51, 17761.2, 2, 314012)

# Correspondent slot 3 (only the first and last bank have one)
corresbankname3 <- c(
  "Commercial National Bank", NA, NA, NA, "National Park Bank"
)
corresbankloc3 <- c("Philadelphia", NA, NA, NA, "New York")
corresdepoin3 <- c(29513.4, NA, NA, NA, 206580)

# Column names are taken from the variable names above.
bankdata <- data.frame(
  bankname, bankid, year, totass,
  corresbankname1, corresbankloc1, corresdepoin1,
  corresbankname2, corresbankloc2, corresdepoin2,
  corresbankname3, corresbankloc3, corresdepoin3
)
This dataset shows the amount(corresdepoin) each bank invested in other banks (corresbankname) and its location (corresbankloc). I have 43 corresbankname, corresbankloc and corresdepoin variables.
Since these banks invest in multiple banks in the same city, I would like to know the total amount invested in each city. Hence, I would like to generate a new column called "total_New York" that sums the amounts in corresdepoin wherever the matching corresbankloc is New York. How can I loop over all 43 sets of variables?
For instance, BankE has $351266 (corresdepoin1) at National Bank (corresbankname1) in New York (corresbankloc1), $314012 at Mechanics' Bank in New York, and $206580 at National Park Bank in New York. I want a new column called "total deposits in New York" showing that the total amount of investments in banks located in New York is $871858. Hence, what I want is a conditional statement that loops over the corresbankloc columns, checks whether each one is New York (or another city), and then sums the associated values in corresdepoin to get the aggregate investment in that city for each bankname.
Also, in stata, if I want to do this for multiple cities, I would generate
local cities "New York" "Philadelphia" "Pittsburgh"
and loop over them. Is there a similar function in R?
Thank you in advance.

Another option would be to reshape the dataset to long format and summarise it with dplyr. You can wrap this in a function that returns totals either for a chosen subset of cities or for every city in the data. I don't know whether this is the most efficient approach.
library(dplyr)
# Total deposits per bank and correspondent city.
#
# Reshapes the wide correspondent columns (corresdepoin*, corresbankloc*)
# to long format and sums corresdepoin within groups.
#
# Arguments:
#   data    Data frame with a `bankname` column plus matching
#           `corresdepoin<k>` / `corresbankloc<k>` columns.
#   city    Character vector of cities to keep. Only used, and only
#           required, when `allcity = FALSE`.
#   byloc   If TRUE, totals are reported separately for each correspondent
#           slot (`Bankloc` = k); if FALSE, slots are summed together.
#   allcity If TRUE, every city in the data is reported and `city` is
#           ignored.
#
# Returns the result of dplyr::summarise(): one row per group with the
# columns bankname, corresbankloc (prefixed with "Totalbylocation_"),
# optionally Bankloc, and Total.
fun1 <- function(data, city, byloc = TRUE, allcity = TRUE) {
  if (!allcity && missing(city)) {
    stop("`city` must be supplied when `allcity = FALSE`", call. = FALSE)
  }

  deposit_cols <- grep("corresdepoin", colnames(data))
  location_cols <- grep("corresbankloc", colnames(data))
  long <- reshape(data,
    idvar = "bankname",
    varying = list(deposit_cols, location_cols),
    timevar = "Bankloc",
    direction = "long",
    v.names = c("corresdepoin", "corresbankloc")
  )

  # Unused correspondent slots have NA as location; drop them.
  long <- long[!is.na(long$corresbankloc), ]
  # seq_len() keeps this valid when no rows are left (1:0 would not be).
  row.names(long) <- seq_len(nrow(long))

  if (!allcity) {
    long <- long[long$corresbankloc %in% city, ]
  }

  # Prefix the location by column name, before grouping. The previous
  # `dataF1[, 2] <- paste(..., dataF1[, 2])` passed a one-column tibble to
  # paste(), which deparses the whole column instead of prefixing each
  # value, and it also rewrote a grouping column after the fact.
  long$corresbankloc <- paste("Totalbylocation", long$corresbankloc,
    sep = "_"
  )

  grouped <- if (byloc) {
    long %>% group_by(bankname, corresbankloc, Bankloc)
  } else {
    long %>% group_by(bankname, corresbankloc)
  }
  grouped %>% summarise(Total = sum(corresdepoin, na.rm = TRUE))
}
as.data.frame(fun1(bankdata, "New York", byloc=TRUE, allcity=FALSE))
# bankname corresbankloc Bankloc
#1 The Anchor Savings Bank of Pittsburgh Totalbylocation_New York 2
#2 The Arsenal Bank Totalbylocation_New York 2
#3 The Ashley Savings Bank Totalbylocation_New York 1
#4 The Bank of Pittsburgh Totalbylocation_New York 1
#5 The Bank of Pittsburgh Totalbylocation_New York 2
#6 The Bank of Pittsburgh Totalbylocation_New York 3
# Total
#1 20218.20
#2 1851.51
#3 13357.80
#4 351266.00
#5 314012.00
#6 206580.00
as.data.frame(fun1(bankdata, "New York", byloc=FALSE, allcity=FALSE))
# bankname corresbankloc Total
#1 The Anchor Savings Bank of Pittsburgh Totalbylocation_New York 20218.20
#2 The Arsenal Bank Totalbylocation_New York 1851.51
#3 The Ashley Savings Bank Totalbylocation_New York 13357.80
#4 The Bank of Pittsburgh Totalbylocation_New York 871858.00
as.data.frame(fun1(bankdata, c("New York", "Pittsburgh"), byloc=FALSE, allcity=FALSE))
as.data.frame(fun1(bankdata, byloc=TRUE, allcity=TRUE))

Related

Finding rows that have the minimum of a specific factor group

I am attempting to find the minimum income in the state.x77 dataset for each level of the state.region variable.
df1 <- data.frame(state.region,state.x77,row.names = state.name)
tapply(state.x77,state.region,min)
I am trying to get it to output which state has the lowest income in each region; e.g. for the South, Alabama would be the state with the lowest income. I'm trying to use tapply, but I keep getting an error saying
Error in tapply(state.x77, state.region, min) :
arguments must have same length
What is the issue?
Here is a solution. First extract the vector of incomes and turn it into a named vector (names = state names). Then use tapply to get, for each region, the name of the state with the minimum income.
state <- setNames(state.x77[, "Income"], rownames(state.x77))
tapply(state, state.region, function(x) names(x)[which.min(x)])
# Northeast South North Central West
# "Maine" "Mississippi" "South Dakota" "New Mexico"
The following, more complicated, code will output state names, regions and incomes.
df1 <- data.frame(
State = rownames(state.x77),
Income = state.x77[, "Income"],
Region = state.region
)
merge(aggregate(Income ~ Region, df1, min), df1)[c(3, 1, 2)]
# State Region Income
#1 South Dakota North Central 4167
#2 Maine Northeast 3694
#3 Mississippi South 3098
#4 New Mexico West 3601
And another solution with aggregate but avoiding merge.
# Lowest income per region, then look up the state that has it.
agg <- aggregate(Income ~ Region, df1, min)
# Match on Region *and* Income. Matching on Income alone could pick a
# state from a different region that happens to have the same income.
i <- match(
  paste(agg$Region, agg$Income),
  paste(df1$Region, df1$Income)
)
data.frame(
  State = df1$State[i],
  Region = df1$Region[i],
  Income = df1$Income[i]
)
# State Region Income
#1 Maine Northeast 3694
#2 Mississippi South 3098
#3 South Dakota North Central 4167
#4 New Mexico West 3601
You can also use this solution:
library(dplyr)
library(tibble)
# Build the table directly from the built-in datasets so the snippet is
# self-contained: state.x77 is a matrix whose row names are the state
# names, and state.region is a factor in the same state order.
as.data.frame(state.x77) %>%
  rownames_to_column("State") %>%
  # Add the region by name rather than via bind_cols(), which produced an
  # auto-generated column name (`...10`) that then had to be renamed.
  mutate(Region = state.region) %>%
  select(Region, State, Income) %>%
  group_by(Region) %>%
  # One row per region: the state with the smallest income.
  slice_min(Income, n = 1, with_ties = FALSE)
# A tibble: 4 x 3
# Groups: Region [4]
Region State Income
<fct> <chr> <dbl>
1 Northeast Maine 3694
2 South Mississippi 3098
3 North Central South Dakota 4167
4 West New Mexico 3601

Cannot change colnames inside function in R

Here is my data -
library(data.table)
basefile2 = data.table(States = c("California","California", "California", "Texas","Texas","Texas", "Ohio", "Ohio", "Ohio"),
district = c("district1", "district2", "district3", "district4", "district5", "district6", "district7","district8", "district9"),
Cities = c("LA", "California City", "San Fran", "Houston", "Dallas", "Austin", "Columbus", "Cleaveland", "Wooster"))
And here is my code -
basefile2[, Consideration := "N"] # initialize the column
Market <- function(state, level, market){
for(i in market)
{
basefile2 <<- basefile2[States == state & get(level) %in% i, Level := paste(i)]
}
names(basefile2)[length(names(basefile2))]<- paste0("NEW_ ",level)
}
Market(state = "California", level = "Cities", market = c("LA", "California City"))
Market(state = "Texas", level = "Cities", market = c("Dallas", "Austin"))
Market(state = "Texas", level = "district", market = c("district4", "district5"))
Here is my output -
States district Cities Consideration Level
1: California district1 LA N LA
2: California district2 California City N California City
3: California district3 San Fran N <NA>
4: Texas district4 Houston N district4
5: Texas district5 Dallas N district5
6: Texas district6 Austin N Austin
7: Ohio district7 Columbus N <NA>
8: Ohio district8 Cleaveland N <NA>
9: Ohio district9 Wooster N <NA>
The output I need is -
The renaming of columns is not happening via my code, I tried using data.table's setnames() too but that too does not give me my required output. Where am I going wrong?
data.table objects have reference semantics (they are modified in place rather than copied on assignment), so you don't need to assign globally with <<-.
changing names can be done either in-place (no need for "Level" placeholder) or after-the-fact with setnames. The problem with using a placeholder, though, is that within the first call in the for loop, a new column is created (regardless if it already exists) and then setnames(basefile2, "Level", paste0("NEW_", level)) might produce a duplicate name (which is possible but ... odd).
# Tag the rows of the global data.table `basefile2` that belong to a market.
#
# For each value in `market`, rows where States == state and the column
# named by `level` equals that value get the value written into the column
# "NEW_<level>". The column is added by reference with `:=`, so nothing
# needs to be returned or re-assigned.
Market <- function(state, level, market) {
  target_col <- paste0("NEW_", level)
  for (m in market) {
    basefile2[
      States == state & get(level) %in% m,
      (target_col) := paste(m)
    ]
  }
}
Market(state = "California", level = "Cities", market = c("LA", "California City"))
Market(state = "Texas", level = "Cities", market = c("Dallas", "Austin"))
Market(state = "Texas", level = "district", market = c("district4", "district5"))
basefile2
# States district Cities Consideration NEW_Cities NEW_district
# 1: California district1 LA N LA <NA>
# 2: California district2 California City N California City <NA>
# 3: California district3 San Fran N <NA> <NA>
# 4: Texas district4 Houston N <NA> district4
# 5: Texas district5 Dallas N Dallas district5
# 6: Texas district6 Austin N Austin <NA>
# 7: Ohio district7 Columbus N <NA> <NA>
# 8: Ohio district8 Cleaveland N <NA> <NA>
# 9: Ohio district9 Wooster N <NA> <NA>

R column mapping

How can I map a column of one CSV file to a column of another CSV file in R, given that both columns have the same data type?
For example, the first column of data frame A contains free text that includes a country name, while a column of a second data frame B contains a standard list of all countries. I need to map every row of the first data frame to the standard country column.
For example column (location) of data frame A consist 10000 rows of data like this
Sydney, Australia
Aarhus C, Central Region, Denmark
Auckland, New Zealand
Mumbai Area, India
Singapore
df1 <- data.frame(col1 = 1:5, col2=c("Sydney, Australia", "Aarhus C, Central Region, Denmark", "Auckland, New Zealand", "Mumbai Area, India", "Singapore"))
Now I have another column (country) of data frame B as
India
USA
New Zealand
UK
Singapore
Denmark
China
df2 <- data.frame(col1=1:7, col2=c("India", "USA", "New Zealand", "UK", "Singapore", "Denmark", "China"))
If location column matches with Country column then, I want to replace that location with country name otherwise it will remain as it is. Sample output is as
Sydney, Australia
Denmark
New Zealand
India
Singapore
Initially, it looked like a trivial question but it's not. This approach works like this:
1. We convert the location string into vector using unlist, strsplit.
2. Then we check if any string in the vector is available in country column. If it is available, we store the country name in res and if not we store notfound.
3. Finally, we check if res contains a country name or not.
# Example input: free-text locations and the reference list of countries.
# stringsAsFactors is spelled out as FALSE (not the reassignable `F`) so
# both columns stay character.
df1 <- data.frame(
  location = c(
    "Sydney, Australia",
    "Aarhus C, Central Region, Denmark",
    "Auckland, New Zealand",
    "Mumbai Area, India",
    "Singapore"
  ),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  country = c(
    "India",
    "USA",
    "New Zealand",
    "UK",
    "Singapore",
    "Denmark",
    "China"
  ),
  stringsAsFactors = FALSE
)
# Replace a location string by the country (or countries) it mentions.
#
# Arguments:
#   i         A single location string whose parts are separated by
#             commas, e.g. "Aarhus C, Central Region, Denmark".
#   countries Character vector of known country names. Defaults to
#             df2$country, which is only looked up if the argument is
#             not supplied.
#
# Returns the comma-separated parts of `i` that appear in `countries`
# (after trimming whitespace); if none does, `i` is returned unchanged.
get_values <- function(i, countries = df2$country) {
  # trimws() is base R, so the function no longer depends on stringr
  # being attached for str_trim().
  parts <- trimws(unlist(strsplit(i, split = ",")))
  matched <- parts[parts %in% countries]
  if (length(matched) == 0) {
    return(i)
  }
  matched
}
df1$location2 <- sapply(df1$location, get_values)
location location2
1 Sydney, Australia Sydney, Australia
2 Aarhus C, Central Region, Denmark Denmark
3 Auckland, New Zealand New Zealand
4 Mumbai Area, India India
5 Singapore Singapore
A solution using tidyverse. First, please convert your col2 to character by setting stringsAsFactors = FALSE because that is easier to work with.
We can use str_extract to extract the matched country name, and then create a new col2 with mutate and ifelse.
df3 <- df1 %>%
mutate(Country = str_extract(col2, paste0(df2$col2, collapse = "|")),
col2 = ifelse(is.na(Country), col2, Country)) %>%
select(-Country)
df3
# col1 col2
# 1 1 Sydney, Australia
# 2 2 Denmark
# 3 3 New Zealand
# 4 4 India
# 5 5 Singapore
We can also start with df1, use separate_rows to separate the country name. After that, use semi_join to check if the country names are in df2. Finally, we can combine the data frame with the original df1 by rows, and then filter the first one for each id in col1. df3 is the final output.
library(tidyverse)
df3 <- df1 %>%
separate_rows(col2, sep = ", ") %>%
semi_join(df2, by = "col2") %>%
bind_rows(df1) %>%
group_by(col1) %>%
slice(1) %>%
ungroup() %>%
arrange(col1)
df3
# # A tibble: 5 x 2
# col1 col2
# <int> <chr>
# 1 1 Sydney, Australia
# 2 2 Denmark
# 3 3 New Zealand
# 4 4 India
# 5 5 Singapore
DATA
df1 <- data.frame(col1 = 1:5,
col2=c("Sydney, Australia", "Aarhus C, Central Region, Denmark", "Auckland, New Zealand", "Mumbai Area, India", "Singapore"),
stringsAsFactors = FALSE)
df2 <- data.frame(col1=1:7,
col2=c("India", "USA", "New Zealand", "UK", "Singapore", "Denmark", "China"),
stringsAsFactors = FALSE)
If you are looking for the countries, and they come after the cities then you can do something like this.
transform(df1,col3= sub(paste0(".*,\\s*(",paste0(df2$col2,collapse="|"),")"),"\\1",col2))
col1 col2 col3
1 1 Sydney, Australia Sydney, Australia
2 2 Aarhus C, Central Region, Denmark Denmark
3 3 Auckland, New Zealand New Zealand
4 4 Mumbai Area, India India
5 5 Singapore Singapore
Breakdown:
> A=sub(".*,\\s(.*)","\\1",df1$col2)
> B=sapply(A,grep,df2$col2,value=T)
> transform(df1,col3=replace(A,!lengths(B),col2[!lengths(B)]))
col1 col2 col3
1 1 Sydney, Australia Sydney, Australia
2 2 Aarhus C, Central Region, Denmark Denmark
3 3 Auckland, New Zealand New Zealand
4 4 Mumbai Area, India India
5 5 Singapore Singapore

Merge two datasets

I create a node list as follows:
name <- c("Joe","Frank","Peter")
city <- c("New York","Detroit","Maimi")
age <- c(24,55,65)
node_list <- data.frame(name,age,city)
node_list
name age city
1 Joe 24 New York
2 Frank 55 Detroit
3 Peter 65 Maimi
Then I create an edge list as follows:
from <- c("Joe","Frank","Peter","Albert")
to <- c("Frank","Albert","James","Tony")
to_city <- c("Detroit","St. Louis","New York","Carson City")
edge_list <- data.frame(from,to,to_city)
edge_list
from to to_city
1 Joe Frank Detroit
2 Frank Albert St. Louis
3 Peter James New York
4 Albert Tony Carson City
Notice that the names in the node list and edge list do not overlap 100%. I want to create a master node list of all the names, capturing city information as well. This is my dplyr attempt to do this:
new_node <- edge_list %>%
gather("from_to", "name", from, to) %>%
distinct(name) %>%
full_join(node_list)
new_node
name age city
1 Joe 24 New York
2 Frank 55 Detroit
3 Peter 65 Maimi
4 Albert NA <NA>
5 James NA <NA>
6 Tony NA <NA>
I need to figure out how to add to_city information. What do I need to add to my dplyr code to make this happen? Thanks.
Join twice, once on to and once on from, with the irrelevant columns subsetted out:
library(dplyr)
# Example data. tibble() replaces the deprecated data_frame(); the
# contents are unchanged.
node_list <- tibble(
  name = c("Joe", "Frank", "Peter"),
  city = c("New York", "Detroit", "Maimi"),
  age = c(24, 55, 65)
)
edge_list <- tibble(
  from = c("Joe", "Frank", "Peter", "Albert"),
  to = c("Frank", "Albert", "James", "Tony"),
  to_city = c("Detroit", "St. Louis", "New York", "Carson City")
)
node_list %>%
full_join(select(edge_list, name = to, city = to_city)) %>%
full_join(select(edge_list, name = from))
#> Joining, by = c("name", "city")
#> Joining, by = "name"
#> # A tibble: 6 x 3
#> name city age
#> <chr> <chr> <dbl>
#> 1 Joe New York 24.
#> 2 Frank Detroit 55.
#> 3 Peter Maimi 65.
#> 4 Albert St. Louis NA
#> 5 James New York NA
#> 6 Tony Carson City NA
In this case the second join doesn't do anything because everybody is already included, but it would insert anyone who only existed in the from column.

R: Mission impossible? How to assign "New York" to a county

I run into problems assigning a county to some city places. When querying via the acs package
> geo.lookup(state = "NY", place = "New York")
state state.name county.name place place.name
1 36 New York <NA> NA <NA>
2 36 New York Bronx County, Kings County, New York County, Queens County, Richmond County 51000 New York city
3 36 New York Oneida County 51011 New York Mills village
, you can see that "New York", for instance, has a bunch of counties. So do Los Angeles, Portland, Oklahoma, Columbus etc. How can such data be assigned to a "county"?
Following code is currently used to match "county.name" with the corresponding county FIPS code. Unfortunately, it only works for cases of only one county name output in the query.
Script
dat <- c("New York, NY","Boston, MA","Los Angeles, CA","Dallas, TX","Palo Alto, CA")
dat <- strsplit(dat, ",")
dat
library(tigris)
library(acs)
data(fips_codes) # FIPS codes with state, code, county information
GeoLookup <- lapply(dat,function(x) {
geo.lookup(state = trimws(x[2]), place = trimws(x[1]))[2,]
})
df <- bind_rows(GeoLookup)
#Rename cols to match
colnames(fips_codes) = c("state.abb", "statefips", "state.name", "countyfips", "county.name")
# Here is a problem, because it works with one item in "county.name" but not more than one (see output below).
df <- df %>% left_join(fips_codes, by = c("state.name", "county.name"))
df
Returns:
state state.name county.name place place.name state.abb statefips countyfips
1 36 New York Bronx County, Kings County, New York County, Queens County, Richmond County 51000 New York city <NA> <NA> <NA>
2 25 Massachusetts Suffolk County 7000 Boston city MA 25 025
3 6 California Los Angeles County 20802 East Los Angeles CDP CA 06 037
4 48 Texas Collin County, Dallas County, Denton County, Kaufman County, Rockwall County 19000 Dallas city <NA> <NA> <NA>
5 6 California San Mateo County 20956 East Palo Alto city CA 06 081
In order to retain data, the left_join might better be matched as "look for the county.name that contains place.name (without the appended suffix such as "city"), or choose the first item by default". It would be great to see how this could be done.
In general: I assume, there's no better way than this approach?
Thanks for your help!
What about something like the code below to create a "long" data frame for joining. We use the tidyverse pipe operator to chain operations. strsplit returns a list, which we unnest to stack the list values (the county names that go with each combination of state.name and place.name) into a long data frame where each county.name now gets its own row.
library(tigris)
library(acs)
library(tidyverse)
dat = geo.lookup(state = "NY", place = "New York")
state state.name county.name place place.name
1 36 New York <NA> NA <NA>
2 36 New York Bronx County, Kings County, New York County, Queens County, Richmond County 51000 New York city
3 36 New York Oneida County 51011 New York Mills village
dat = dat %>%
group_by(state.name, place.name) %>%
mutate(county.name = strsplit(county.name, ", ")) %>%
unnest
state state.name place place.name county.name
<chr> <chr> <int> <chr> <chr>
1 36 New York NA <NA> <NA>
2 36 New York 51000 New York city Bronx County
3 36 New York 51000 New York city Kings County
4 36 New York 51000 New York city New York County
5 36 New York 51000 New York city Queens County
6 36 New York 51000 New York city Richmond County
7 36 New York 51011 New York Mills village Oneida County
UPDATE: Regarding the second question in your comment, assuming you have the vector of metro areas already, how about this:
dat <- c("New York, NY","Boston, MA","Los Angeles, CA","Dallas, TX","Palo Alto, CA")
df <- map_df(strsplit(dat, ", "), function(x) {
geo.lookup(state = x[2], place = x[1])[-1, ] %>%
group_by(state.name, place.name) %>%
mutate(county.name = strsplit(county.name, ", ")) %>%
unnest
})
df
state state.name place place.name county.name
1 36 New York 51000 New York city Bronx County
2 36 New York 51000 New York city Kings County
3 36 New York 51000 New York city New York County
4 36 New York 51000 New York city Queens County
5 36 New York 51000 New York city Richmond County
6 36 New York 51011 New York Mills village Oneida County
7 25 Massachusetts 7000 Boston city Suffolk County
8 25 Massachusetts 7000 Boston city Suffolk County
9 6 California 20802 East Los Angeles CDP Los Angeles County
10 6 California 39612 Lake Los Angeles CDP Los Angeles County
11 6 California 44000 Los Angeles city Los Angeles County
12 48 Texas 19000 Dallas city Collin County
13 48 Texas 19000 Dallas city Dallas County
14 48 Texas 19000 Dallas city Denton County
15 48 Texas 19000 Dallas city Kaufman County
16 48 Texas 19000 Dallas city Rockwall County
17 48 Texas 40516 Lake Dallas city Denton County
18 6 California 20956 East Palo Alto city San Mateo County
19 6 California 55282 Palo Alto city Santa Clara County
UPDATE 2: If I understand your comments, for cities (actually place names in the example) with more than one county, we want only the county that includes the same name as the city (for example, New York County in the case of New York city), or the first county in the list otherwise. The following code selects a county with the same name as the city or, if there isn't one, the first county for that city. You might have to tweak it a bit to make it work for the entire U.S. For example, for it to work for Louisiana, you might need gsub(" County| Parish"... instead of gsub(" County"....
map_df(strsplit(dat, ", "), function(x) {
geo.lookup(state = x[2], place = x[1])[-1, ] %>%
group_by(state.name, place.name) %>%
mutate(county.name = strsplit(county.name, ", ")) %>%
unnest %>%
slice(max(1, which(grepl(sub(" [A-Za-z]*$","", place.name), gsub(" County", "", county.name))), na.rm=TRUE))
})
state state.name place place.name county.name
<chr> <chr> <int> <chr> <chr>
1 36 New York 51000 New York city New York County
2 36 New York 51011 New York Mills village Oneida County
3 25 Massachusetts 7000 Boston city Suffolk County
4 6 California 20802 East Los Angeles CDP Los Angeles County
5 6 California 39612 Lake Los Angeles CDP Los Angeles County
6 6 California 44000 Los Angeles city Los Angeles County
7 48 Texas 19000 Dallas city Dallas County
8 48 Texas 40516 Lake Dallas city Denton County
9 6 California 20956 East Palo Alto city San Mateo County
10 6 California 55282 Palo Alto city Santa Clara County
Could you prep the data by using something like the below code?
new_york_data <- geo.lookup(state = "NY", place = "New York")
# Expand every row of a geo.lookup() result into one row per county.
#
# Arguments:
#   full_data Data frame whose rows are each passed to replicateCounty().
#
# Returns a single data frame with the expanded rows stacked by
# plyr::rbind.fill(); an empty data frame if `full_data` has no rows.
prep_data <- function(full_data) {
  n_rows <- nrow(full_data)
  # Guard the empty case explicitly: 1:nrow() would iterate over c(1, 0).
  if (n_rows == 0) {
    return(data.frame())
  }
  # Expand all rows first and bind once, instead of re-growing the result
  # with rbind.fill() on every iteration.
  pieces <- lapply(seq_len(n_rows), function(row) {
    replicateCounty(full_data[row, ])
  })
  plyr::rbind.fill(pieces)
}
# Turn one lookup row into one row per county.
#
# `row` is a one-row data frame with the columns state, state.name,
# county.name, place and place.name. county.name is split on commas and
# each piece is trimmed; the other fields are repeated for every county.
replicateCounty <- function(row) {
  county_pieces <- str_split(row$county.name, ",")
  county_names <- str_trim(unlist(county_pieces))
  data.frame(
    state = row$state,
    state.name = row$state.name,
    county.name = county_names,
    place = row$place,
    place.name = row$place.name
  )
}
prep_data(new_york_data)
It's a little messy, and you'll need the plyr and stringr packages. Once you have prepped the data, you should be able to join on it.

Resources