Cannot change colnames inside function in R

Here is my data -
library(data.table)
basefile2 = data.table(States = c("California","California", "California", "Texas","Texas","Texas", "Ohio", "Ohio", "Ohio"),
district = c("district1", "district2", "district3", "district4", "district5", "district6", "district7","district8", "district9"),
Cities = c("LA", "California City", "San Fran", "Houston", "Dallas", "Austin", "Columbus", "Cleaveland", "Wooster"))
And here is my code -
basefile2[, Consideration := "N"] # initialize the column
Market <- function(state, level, market){
  for(i in market)
  {
    basefile2 <<- basefile2[States == state & get(level) %in% i, Level := paste(i)]
  }
  names(basefile2)[length(names(basefile2))] <- paste0("NEW_ ", level)
}
Market(state = "California", level = "Cities", market = c("LA", "California City"))
Market(state = "Texas", level = "Cities", market = c("Dallas", "Austin"))
Market(state = "Texas", level = "district", market = c("district4", "district5"))
Here is my output -
States district Cities Consideration Level
1: California district1 LA N LA
2: California district2 California City N California City
3: California district3 San Fran N <NA>
4: Texas district4 Houston N district4
5: Texas district5 Dallas N district5
6: Texas district6 Austin N Austin
7: Ohio district7 Columbus N <NA>
8: Ohio district8 Cleaveland N <NA>
9: Ohio district9 Wooster N <NA>
The output I need has the new column renamed according to level (e.g. NEW_Cities), but the renaming is not happening via my code. I tried data.table's setnames() too, but that did not give me the required output either. Where am I going wrong?

data.table objects are updated by reference (not by copy-on-modify semantics), so you don't need to assign globally with <<-.
Changing names can be done either in-place (no need for the "Level" placeholder) or after the fact with setnames. The problem with using a placeholder, though, is that the first assignment inside the for loop creates a new Level column (regardless of whether one already exists), so a later setnames(basefile2, "Level", paste0("NEW_", level)) can produce a duplicate column name (which data.table allows, but is odd).
Market <- function(state, level, market){
  for (i in market) {
    basefile2[States == state & get(level) %in% i,
              paste0("NEW_", level) := paste(i)]
  }
}
Market(state = "California", level = "Cities", market = c("LA", "California City"))
Market(state = "Texas", level = "Cities", market = c("Dallas", "Austin"))
Market(state = "Texas", level = "district", market = c("district4", "district5"))
basefile2
# States district Cities Consideration NEW_Cities NEW_district
# 1: California district1 LA N LA <NA>
# 2: California district2 California City N California City <NA>
# 3: California district3 San Fran N <NA> <NA>
# 4: Texas district4 Houston N <NA> district4
# 5: Texas district5 Dallas N Dallas district5
# 6: Texas district6 Austin N Austin <NA>
# 7: Ohio district7 Columbus N <NA> <NA>
# 8: Ohio district8 Cleaveland N <NA> <NA>
# 9: Ohio district9 Wooster N <NA> <NA>
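As an aside, the after-the-fact setnames() route mentioned above can also be made to work, but only if you guard against the target name already existing (which is exactly what happens here with two "Cities" calls). This is only a rough sketch of that idea, not the recommended approach; Market2 and the guard logic are illustrative:
Market2 <- function(state, level, market){
  for (i in market) {
    basefile2[States == state & get(level) %in% i, Level := paste(i)]
  }
  newcol <- paste0("NEW_", level)
  if (newcol %in% names(basefile2)) {
    # the target column already exists: fold the placeholder into it, then drop it
    basefile2[!is.na(Level), (newcol) := Level]
    basefile2[, Level := NULL]
  } else {
    setnames(basefile2, "Level", newcol)
  }
  invisible(basefile2)
}
The in-place version above is simpler; this variant mainly illustrates the duplicate-name caveat.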

Related

Previous marking nullified while using ifelse condition in R

Here is my data -
library(data.table)
basefile2 = data.table(States = c("California","California", "California", "Texas","Texas","Texas", "Ohio", "Ohio", "Ohio"),
Cities = c("LA", "California City", "San Fran", "Houston", "Dallas", "Austin", "Columbus", "Cleaveland", "Wooster"))
And here is my code -
Market = function(state, city){
  if (missing(state)) stop("Enter State", call. = FALSE)
  if (missing(city)) stop("Enter City(ies)", call. = FALSE)
  basefile2 <<- basefile2[, "Consideration" := ifelse(States == state & Cities %in% city, "Y",
                            ifelse("Consideration" %in% colnames(basefile) & "Consideration" == "Y", "Y", "N"))]
}
Market(state = "California",
city = c("LA", "California City"))
Market(state = "Texas",
city = c("Dallas", "Austin"))
The previous marking in the Consideration column (from when the state was California) is getting nullified. Yes, I do need to pass different states in separate function calls due to certain input constraints.
Here is my output -
States Cities Consideration
1: California LA N
2: California California City N
3: California San Fran N
4: Texas Houston N
5: Texas Dallas Y
6: Texas Austin Y
7: Ohio Columbus N
8: Ohio Cleaveland N
9: Ohio Wooster N
Whereas the output I need has "Y" in the Consideration column for LA, California City, Dallas and Austin.
One option is to add the "Consideration" column to the data.table at the start, and then use that as a condition to update within the function so that previous updates are not replaced.
library(data.table)
basefile2 <- data.table(...) # as you had
basefile2[, Consideration := "N"] # initialize the column
Market <- function(state, city){
  basefile2 <<- basefile2[Consideration == "N",   # only update rows that are still "N"
                          "Consideration" := ifelse(States == state & Cities %in% city, "Y", "N")]
}
Or maybe like this:
Market <- function(state, city){
  basefile2 <<- basefile2[States == state & Cities %in% city, Consideration := "Y"]
}
Market(state = "California", city = c("LA", "California City"))
Market(state = "Texas", city = c("Dallas", "Austin"))
basefile2
States Cities Consideration
1: California LA Y
2: California California City Y
3: California San Fran N
4: Texas Houston N
5: Texas Dallas Y
6: Texas Austin Y
7: Ohio Columbus N
8: Ohio Cleaveland N
9: Ohio Wooster N
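As noted in the first answer above, data.table updates columns by reference, so the <<- is not strictly required in either version. A minimal sketch of the second option without the global assignment:
Market <- function(state, city){
  # := modifies basefile2 in place, so no <<- (or return value) is needed
  basefile2[States == state & Cities %in% city, Consideration := "Y"]
  invisible(basefile2)
}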

Need to ID states from mixed names/IDs in location data

I need to ID states from mixed location data: search for the 50 state abbreviations and the 50 full state names, and return the state abbreviation.
N <- 1:10
Loc <- c("Los Angeles, CA", "Manhattan, NY", "Florida, USA", "Chicago, IL" , "Houston, TX",
+ "Texas, USA", "Corona, CA", "Georgia, USA", "WV NY NJ", "qwerty uy PO DOPL JKF" )
df <- data.frame(N, Loc)
# Objective: create a variable "state" such that
# state contains the abbreviated state name from Loc:
# for "Los Angeles, CA", state = CA
# for "Florida, USA", state = FL
# for "WV NY NJ", state = NA
# for "qwerty NJuy PO DOPL JKF", state = NA (in spite of containing the string NJ, it is not wrapped in spaces)
# End result should be Newdf
State <- c("CA", "NY", "FL", "IL", "TX","TX", "CA", "GA", NA, NA)
Newdf <- data.frame(N, Loc, State)
Newdf
N Loc State
1 1 Los Angeles, CA CA
2 2 Manhattan, NY NY
3 3 Florida, USA FL
4 4 Chicago, IL IL
5 5 Houston, TX TX
6 6 Texas, USA TX
7 7 Corona, CA CA
8 8 Georgia, USA GA
9 9 WV NY NJ <NA>
10 10 qwerty uy PO DOPL JKF <NA>
Is there a package for this, or can a loop be written? Even if the approach could be demonstrated with just a few states, that would be sufficient; I will post the full solution when I get to it. By the way, this is for a Twitter dataset downloaded using the rtweet package, and the variable is place_full_name.
There are built-in constants in R, state.abb and state.name, which can be used.
vars <- stringr::str_extract(df$Loc, paste0('\\b',c(state.abb, state.name),
'\\b', collapse = '|'))
#[1] "CA" "NY" "Florida" "IL" "TX" "Texas" "CA" "Georgia" "WV" NA
If you want everything as abbreviations, we can go further and do:
inds <- vars %in% state.name
vars[inds] <- state.abb[match(vars[inds], state.name)]
vars
#[1] "CA" "NY" "FL" "IL" "TX" "TX" "CA" "GA" "WV" NA
However, note that in the 9th row you expect NA but this returns "WV", since it is a valid state abbreviation. In such cases you need to prepare rules that are strict enough to extract only the state you intend and nothing else.
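For example, one stricter rule that happens to fit this particular data (an assumption about its shape, not a general solution) is to accept an abbreviation only when it directly follows a comma ("Los Angeles, CA") and a full name only when it directly precedes one ("Florida, USA"). A sketch with stringr::str_match, using df from the question:
library(stringr)

# abbreviation immediately after a comma, e.g. "Los Angeles, CA"
pat_abb  <- paste0(',\\s*(', paste(state.abb,  collapse = '|'), ')\\b')
# full state name immediately before a comma, e.g. "Florida, USA"
pat_name <- paste0('\\b(',   paste(state.name, collapse = '|'), '),')

abb  <- str_match(df$Loc, pat_abb)[, 2]
full <- str_match(df$Loc, pat_name)[, 2]
ifelse(!is.na(abb), abb, state.abb[match(full, state.name)])
# [1] "CA" "NY" "FL" "IL" "TX" "TX" "CA" "GA" NA   NA
Row 9 ("WV NY NJ") now returns NA because there is no comma pattern to anchor on.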
Utilising the built-in R constants, state.abb and state.name, we can try to extract these from the Loc with regular expressions.
state.abbs <- sub('.+, ([A-Z]{2})', '\\1', df$Loc)
state.names <- sub('^(.+),.+', '\\1', df$Loc)
Now, if an extracted abbreviation is not one of the built-in ones, we use match to find the position of the extracted full name in the built-in state.name vector and use that to index state.abb; otherwise we keep the abbreviation we already have. Entries that match neither become NA.
df$state.abb <- ifelse(!state.abbs %in% state.abb,
state.abb[match(state.names, state.name)], state.abbs)
df
N Loc state.abb
1 1 Los Angeles, CA CA
2 2 Manhattan, NY NY
3 3 Florida, USA FL
4 4 Chicago, IL IL
5 5 Houston, TX TX
6 6 Texas, USA TX
7 7 Corona, CA CA
8 8 Georgia, USA GA
9 9 WV NY NJ <NA>
10 10 qwerty uy PO DOPL JKF <NA>

How to clean the city and state (both full and abbreviated) using R

I have a list of uncleaned city and state values from the "Location" field in Twitter, for example:
location <- c("the Great Lake State", "PA", "Harrisburg, Pennsylvania",
"Pennsylvania", "MI", "Detroit,MI")
How can I clean the data to make a list with two columns, city and state?
You can do this:
splitted_list <- strsplit(location, ",")
# pad one-element entries with a leading NA so every entry becomes (city, state)
wide_matrix <- sapply(splitted_list, function(x) c(rep(NA, length(x) == 1), x))
res <- setNames(data.frame(t(wide_matrix), stringsAsFactors = FALSE), c("city", "state"))
res
# city state
# 1 <NA> the Great Lake State
# 2 <NA> PA
# 3 Harrisburg Pennsylvania
# 4 <NA> Pennsylvania
# 5 <NA> MI
# 6 Detroit MI
Assuming your data (location) is already part of a data.frame which you want to clean up, tidyr::separate can be a suitable option.
location <- c("the Great Lake State", "PA", "Harrisburg, Pennsylvania",
"Pennsylvania", "MI", "Detroit,MI")
library(tidyverse)
as.data.frame(location) %>% # I created a data.frame, which is not needed in actual data
tidyr::separate(location, c("City", "State"), sep=",", fill="left")
# City State
# 1 <NA> the Great Lake State
# 2 <NA> PA
# 3 Harrisburg Pennsylvania
# 4 <NA> Pennsylvania
# 5 <NA> MI
# 6 Detroit MI
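If you also want the state column normalized to two-letter abbreviations, the built-in state.name/state.abb constants can be mapped in afterwards. Nicknames such as "the Great Lake State" would still need their own lookup table, so this is only a partial sketch continuing from the chunk above:
as.data.frame(location) %>%
  tidyr::separate(location, c("City", "State"), sep = ",", fill = "left") %>%
  dplyr::mutate(
    State = trimws(State),
    # map full names (e.g. "Pennsylvania") to abbreviations; leave "PA", "MI" and nicknames as-is
    State = ifelse(State %in% state.name, state.abb[match(State, state.name)], State)
  )
With this, the "Pennsylvania" rows become "PA" while row 1 keeps its nickname.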

R: Mission impossible? How to assign "New York" to a county

I run into problems assigning a county to some city places. When querying via the acs package
> geo.lookup(state = "NY", place = "New York")
state state.name county.name place place.name
1 36 New York <NA> NA <NA>
2 36 New York Bronx County, Kings County, New York County, Queens County, Richmond County 51000 New York city
3 36 New York Oneida County 51011 New York Mills village
you can see that "New York", for instance, has a bunch of counties. So do Los Angeles, Portland, Oklahoma, Columbus, etc. How can such data be assigned to a single "county"?
The following code is currently used to match "county.name" with the corresponding county FIPS code. Unfortunately, it only works when the query returns a single county name.
Script
dat <- c("New York, NY","Boston, MA","Los Angeles, CA","Dallas, TX","Palo Alto, CA")
dat <- strsplit(dat, ",")
dat
library(tigris)
library(acs)
library(dplyr)     # needed for bind_rows / %>% / left_join below
data(fips_codes)   # FIPS codes with state, code, county information
GeoLookup <- lapply(dat, function(x) {
  geo.lookup(state = trimws(x[2]), place = trimws(x[1]))[2, ]
})
df <- bind_rows(GeoLookup)
#Rename cols to match
colnames(fips_codes) = c("state.abb", "statefips", "state.name", "countyfips", "county.name")
# Here is a problem, because it works with one item in "county.name" but not more than one (see output below).
df <- df %>% left_join(fips_codes, by = c("state.name", "county.name"))
df
Returns:
state state.name county.name place place.name state.abb statefips countyfips
1 36 New York Bronx County, Kings County, New York County, Queens County, Richmond County 51000 New York city <NA> <NA> <NA>
2 25 Massachusetts Suffolk County 7000 Boston city MA 25 025
3 6 California Los Angeles County 20802 East Los Angeles CDP CA 06 037
4 48 Texas Collin County, Dallas County, Denton County, Kaufman County, Rockwall County 19000 Dallas city <NA> <NA> <NA>
5 6 California San Mateo County 20956 East Palo Alto city CA 06 081
In order to retain the data, the left_join might better be matched as "look for the county.name that contains place.name (without the trailing 'city' etc. in the name), or choose the first item by default". It would be great to see how this could be done.
In general: I assume there's no better way than this approach?
Thanks for your help!
What about something like the code below to create a "long" data frame for joining? We use the tidyverse pipe operator to chain operations. strsplit returns a list, which we unnest to stack the list values (the county names that go with each combination of state.name and place.name) into a long data frame where each county.name now gets its own row.
library(tigris)
library(acs)
library(tidyverse)
dat = geo.lookup(state = "NY", place = "New York")
state state.name county.name place place.name
1 36 New York <NA> NA <NA>
2 36 New York Bronx County, Kings County, New York County, Queens County, Richmond County 51000 New York city
3 36 New York Oneida County 51011 New York Mills village
dat = dat %>%
  group_by(state.name, place.name) %>%
  mutate(county.name = strsplit(county.name, ", ")) %>%
  unnest
state state.name place place.name county.name
<chr> <chr> <int> <chr> <chr>
1 36 New York NA <NA> <NA>
2 36 New York 51000 New York city Bronx County
3 36 New York 51000 New York city Kings County
4 36 New York 51000 New York city New York County
5 36 New York 51000 New York city Queens County
6 36 New York 51000 New York city Richmond County
7 36 New York 51011 New York Mills village Oneida County
UPDATE: Regarding the second question in your comment, assuming you have the vector of metro areas already, how about this:
dat <- c("New York, NY","Boston, MA","Los Angeles, CA","Dallas, TX","Palo Alto, CA")
df <- map_df(strsplit(dat, ", "), function(x) {
  geo.lookup(state = x[2], place = x[1])[-1, ] %>%
    group_by(state.name, place.name) %>%
    mutate(county.name = strsplit(county.name, ", ")) %>%
    unnest
})
df
state state.name place place.name county.name
1 36 New York 51000 New York city Bronx County
2 36 New York 51000 New York city Kings County
3 36 New York 51000 New York city New York County
4 36 New York 51000 New York city Queens County
5 36 New York 51000 New York city Richmond County
6 36 New York 51011 New York Mills village Oneida County
7 25 Massachusetts 7000 Boston city Suffolk County
8 25 Massachusetts 7000 Boston city Suffolk County
9 6 California 20802 East Los Angeles CDP Los Angeles County
10 6 California 39612 Lake Los Angeles CDP Los Angeles County
11 6 California 44000 Los Angeles city Los Angeles County
12 48 Texas 19000 Dallas city Collin County
13 48 Texas 19000 Dallas city Dallas County
14 48 Texas 19000 Dallas city Denton County
15 48 Texas 19000 Dallas city Kaufman County
16 48 Texas 19000 Dallas city Rockwall County
17 48 Texas 40516 Lake Dallas city Denton County
18 6 California 20956 East Palo Alto city San Mateo County
19 6 California 55282 Palo Alto city Santa Clara County
UPDATE 2: If I understand your comments, for cities (actually place names in the example) with more than one county, we want only the county that includes the same name as the city (for example, New York County in the case of New York city), or the first county in the list otherwise. The following code selects a county with the same name as the city or, if there isn't one, the first county for that city. You might have to tweak it a bit to make it work for the entire U.S. For example, for it to work for Louisiana, you might need gsub(" County| Parish"... instead of gsub(" County"....
map_df(strsplit(dat, ", "), function(x) {
  geo.lookup(state = x[2], place = x[1])[-1, ] %>%
    group_by(state.name, place.name) %>%
    mutate(county.name = strsplit(county.name, ", ")) %>%
    unnest %>%
    slice(max(1, which(grepl(sub(" [A-Za-z]*$", "", place.name), gsub(" County", "", county.name))), na.rm = TRUE))
})
state state.name place place.name county.name
<chr> <chr> <int> <chr> <chr>
1 36 New York 51000 New York city New York County
2 36 New York 51011 New York Mills village Oneida County
3 25 Massachusetts 7000 Boston city Suffolk County
4 6 California 20802 East Los Angeles CDP Los Angeles County
5 6 California 39612 Lake Los Angeles CDP Los Angeles County
6 6 California 44000 Los Angeles city Los Angeles County
7 48 Texas 19000 Dallas city Dallas County
8 48 Texas 40516 Lake Dallas city Denton County
9 6 California 20956 East Palo Alto city San Mateo County
10 6 California 55282 Palo Alto city Santa Clara County
Could you prep the data by using something like the below code?
library(stringr)  # for str_split / str_trim below

new_york_data <- geo.lookup(state = "NY", place = "New York")

prep_data <- function(full_data){
  output <- data.frame()
  for(row in 1:nrow(full_data)){
    new_rows <- replicateCounty(full_data[row, ])
    output <- plyr::rbind.fill(output, new_rows)
  }
  return(output)
}

replicateCounty <- function(row){
  counties <- str_trim(unlist(str_split(row$county.name, ",")))
  output <- data.frame(state = row$state,
                       state.name = row$state.name,
                       county.name = counties,
                       place = row$place,
                       place.name = row$place.name)
  return(output)
}

prep_data(new_york_data)
It's a little messy and you'll need the plyr and stringr packages. Once you prep the data, you should be able to join on it.
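Once each county is on its own row, the left_join from the question should match. A sketch reusing the column renaming from the question (this assumes the county spellings returned by geo.lookup line up with those in tigris::fips_codes; factor columns from prep_data will be coerced to character by the join):
library(dplyr)
library(tigris)

data(fips_codes)  # state / county FIPS codes, as in the question
colnames(fips_codes) <- c("state.abb", "statefips", "state.name", "countyfips", "county.name")

prep_data(new_york_data) %>%
  left_join(fips_codes, by = c("state.name", "county.name"))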

Summing over columns with if statement in R

I have data that looks like this.
bankname bankid year totass corresbankname1 corresbankloc1 corresdepoin1 corresbankname2 corresbankloc2 corresdepoin2 corresbankname3 corresbankloc3 corresdepoin3
BankA 1 1881 244789 First Bank New York 7250.32 Third National Bank Philadelphia 20218.2 Commercial Bank Philadelphia 29513.4
BankB 2 1881 195755 National Bank Pittsburgh 10243.6 Union Trust Company New York 1851.51 NA NA NA
Bankc 3 1881 107736 Mechanics' Bank New York 13357.8 Wyoming Bank Wilkes-Barre 17761.2 NA NA NA
BankD 4 1881 170600 Commonwealth Bank Philadelphia 3.35 Seventh National Bank Philadelphia 2 NA NA NA
BankE 5 1881 320000 National Bank New York 351266 Mechanics' Bank New York 314012 National Park Bank New York 206580
This can be replicated by
bankname <- c("The Anchor Savings Bank of Pittsburgh","The Arsenal Bank","The Ashley Savings Bank","The Bank of America of Philadelphia","The Bank of Pittsburgh")
bankid <- c( 1, 2, 3, 4, 5)
year<- c( 1881, 1881, 1881, 1881, 1881)
totass <- c(244789, 195755, 107736, 170600, 32000000)
corresbankname1 <- c("First National Bank","National Bank of Commerce","Mechanics' National Bank","Commonwealth National Bank","National Bank of Commerce")
corresbankloc1 <-c("Philadelphia","Pittsburgh","New York","Philadelphia","New York")
corresdepoin1<-c(7250.32,10243.6,13357.8,3.35,351266)
corresbankname2 <- c("Third National Bank","Union Trust Company","Wyoming National Bank","Seventh National Bank","Mechanics' National Bank")
corresbankloc2<-c("New York","New York","Wilkes-Barre","Philadelphia","New York")
corresdepoin2<-c(20218.2,1851.51,17761.2,2,314012)
corresbankname3<-c("Commercial National Bank",NA,NA,NA,"National Park Bank")
corresbankloc3<-c("Philadelphia",NA,NA,NA,"New York")
corresdepoin3<-c(29513.4,NA,NA,NA,206580)
bankdata<-data.frame(bankname, bankid,year,totass,corresbankname1,corresbankloc1,corresdepoin1,corresbankname2,corresbankloc2,corresdepoin2,corresbankname3,corresbankloc3,corresdepoin3)
This dataset shows the amount (corresdepoin) each bank invested in other banks (corresbankname) and the location of those banks (corresbankloc). I have 43 corresbankname, corresbankloc and corresdepoin variables.
Since these banks invest in multiple banks in the same city, I would like to know the total amount invested in each city. Hence, I would like to generate a new column called "total_New York" and sum the amounts in corresdepoin whenever corresbankloc is New York. How can I loop over the 43 sets of variables?
For instance, BankE has $351266 (corresdepoin1) at National Bank (corresbankname1) in New York (corresbankloc1), $314012 at Mechanics' Bank in New York, and $206580 at National Park Bank in New York. I want a new column called "total deposits in New York" showing that the total amount invested in banks located in New York is $871858. Hence, what I want is a conditional statement that loops over the location columns (corresbankloc), checks whether the value is New York (or another city), and sums the associated corresdepoin values to get the aggregated total investment in that city for each bankname.
Also, in Stata, if I wanted to do this for multiple cities, I would generate
local cities "New York" "Philadelphia" "Pittsburgh"
and loop over them. Is there a similar approach in R?
Thank you in advance.
Another option would be to reshape the dataset to long format and then summarise with dplyr. You can create a function that outputs totals either for a subset of cities or for every city in the data. I don't know if this is the most efficient way.
library(dplyr)
fun1 <- function(data, city, byloc = TRUE, allcity = TRUE) {
  data1 <- reshape(data, idvar = "bankname",
                   varying = list(grep("corresdepoin", colnames(data)),
                                  grep("corresbankloc", colnames(data))),
                   timevar = "Bankloc", direction = "long",
                   v.names = c("corresdepoin", "corresbankloc"))
  data1 <- data1[!is.na(data1$corresbankloc), ]
  row.names(data1) <- 1:nrow(data1)

  funlocorNot <- function(data, city, grouploc = TRUE) {
    dataF <- data %>%
      filter(corresbankloc %in% city)
    if (grouploc) {
      dataF1 <- dataF %>%
        group_by(bankname, corresbankloc, Bankloc) %>%
        summarise(Total = sum(corresdepoin, na.rm = TRUE))
    } else {
      dataF1 <- dataF %>%
        group_by(bankname, corresbankloc) %>%
        summarise(Total = sum(corresdepoin, na.rm = TRUE))
    }
    dataF1[, 2] <- paste("Totalbylocation", dataF1[, 2], sep = "_")
    dataF1
  }

  funallCity <- function(data, grouploc = TRUE) {
    if (grouploc) {
      dataF1 <- data %>%
        group_by(bankname, corresbankloc, Bankloc) %>%
        summarise(Total = sum(corresdepoin, na.rm = TRUE))
    } else {
      dataF1 <- data %>%
        group_by(bankname, corresbankloc) %>%
        summarise(Total = sum(corresdepoin, na.rm = TRUE))
    }
    dataF1[, 2] <- paste("Totalbylocation", dataF1[, 2], sep = "_")
    dataF1
  }

  if (!allcity) {
    if (byloc) {
      funlocorNot(data1, city, TRUE)
    } else {
      funlocorNot(data1, city, FALSE)
    }
  } else {
    if (byloc) {
      funallCity(data1, TRUE)
    } else {
      funallCity(data1, FALSE)
    }
  }
}
as.data.frame(fun1(bankdata, "New York", byloc=TRUE, allcity=FALSE))
# bankname corresbankloc Bankloc
#1 The Anchor Savings Bank of Pittsburgh Totalbylocation_New York 2
#2 The Arsenal Bank Totalbylocation_New York 2
#3 The Ashley Savings Bank Totalbylocation_New York 1
#4 The Bank of Pittsburgh Totalbylocation_New York 1
#5 The Bank of Pittsburgh Totalbylocation_New York 2
#6 The Bank of Pittsburgh Totalbylocation_New York 3
# Total
#1 20218.20
#2 1851.51
#3 13357.80
#4 351266.00
#5 314012.00
#6 206580.00
as.data.frame(fun1(bankdata, "New York", byloc=FALSE, allcity=FALSE))
# bankname corresbankloc Total
#1 The Anchor Savings Bank of Pittsburgh Totalbylocation_New York 20218.20
#2 The Arsenal Bank Totalbylocation_New York 1851.51
#3 The Ashley Savings Bank Totalbylocation_New York 13357.80
#4 The Bank of Pittsburgh Totalbylocation_New York 871858.00
as.data.frame(fun1(bankdata, c("New York", "Pittsburgh"), byloc=FALSE, allcity=FALSE))
as.data.frame(fun1(bankdata, byloc=TRUE, allcity=TRUE))
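If you would rather stay close to the Stata-style list-and-loop from the question, here is a base R sketch that adds one total_<city> column per city; the cities vector and the total_ column names are just examples:
cities <- c("New York", "Philadelphia", "Pittsburgh")

# paired location / amount columns: corresbankloc1..k and corresdepoin1..k
loc_cols <- grep("^corresbankloc", names(bankdata), value = TRUE)
dep_cols <- sub("corresbankloc", "corresdepoin", loc_cols)

for (city in cities) {
  per_col <- mapply(function(loc, dep) {
    # take the amount where this correspondent bank is located in `city`, otherwise 0
    ifelse(!is.na(bankdata[[loc]]) & bankdata[[loc]] == city, bankdata[[dep]], 0)
  }, loc_cols, dep_cols)
  bankdata[[paste0("total_", city)]] <- rowSums(per_col, na.rm = TRUE)
}

bankdata[, c("bankname", paste0("total_", cities))]
For "The Bank of Pittsburgh" this gives total_New York = 351266 + 314012 + 206580 = 871858, matching the total described in the question.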
