Previous marking nullified while using ifelse condition in R - r

Here is my data -
library(data.table)
basefile2 = data.table(States = c("California","California", "California", "Texas","Texas","Texas", "Ohio", "Ohio", "Ohio"),
Cities = c("LA", "California City", "San Fran", "Houston", "Dallas", "Austin", "Columbus", "Cleaveland", "Wooster"))
And here is my code -
# Mark rows of the global data.table `basefile2` for one state and a set of
# cities by writing "Y"/"N" into its `Consideration` column.
#
# state: value compared against the `States` column with `==`.
# city:  vector of values matched against the `Cities` column with `%in%`.
#
# Stops with an error when either argument is missing.
Market = function(state, city){
# Both arguments are required; `call. = FALSE` keeps the call out of the message.
if (missing(state)) stop("Enter State",
call. = FALSE)
if (missing(city)) stop("Enter City(ies)",
call. = FALSE)
# Rows matching the requested state/cities get "Y". Every other row takes the
# value of the inner ifelse(), whose test is a single value rather than a
# per-row vector: `"Consideration" == "Y"` compares two string literals (always
# FALSE), and `colnames(basefile)` refers to `basefile`, not `basefile2`.
# NOTE(review): as written the fallback is therefore "N" for every non-matching
# row, which overwrites "Y" marks left by earlier calls; presumably the
# `Consideration` column itself was meant to be tested - confirm intent.
basefile2 <<- basefile2[, "Consideration" := ifelse(States == state & Cities %in% city, "Y",
ifelse("Consideration" %in% colnames(basefile) & "Consideration" == "Y", "Y", "N"))]
}
Market(state = "California",
city = c("LA", "California City"))
Market(state = "Texas",
city = c("Dallas", "Austin"))
The previous marking in the Consideration column, made when the state was California, is getting nullified. Yes, I need to pass the different states in separate function calls due to certain input constraints.
Here is my output
States Cities Consideration
1: California LA N
2: California California City N
3: California San Fran N
4: Texas Houston N
5: Texas Dallas Y
6: Texas Austin Y
7: Ohio Columbus N
8: Ohio Cleaveland N
9: Ohio Wooster N
Whereas the output I need has "Y" in the Consideration column for California City, LA, Austin & Dallas.

One option is to add the "Consideration" column to the data.table at the start, and then use that as a condition to update within the function so that previous updates are not replaced.
library(data.table)
basefile2 <- data.table(...) # as you had
basefile2[, Consideration := "N"] # initialize the column
# Flag rows of the global data.table `basefile2` whose `States` equals `state`
# and whose `Cities` is one of `city` by setting `Consideration` to "Y".
#
# state: value compared against the `States` column with `==`.
# city:  vector of values matched against the `Cities` column with `%in%`.
#
# Expects `basefile2` to already have a `Consideration` column.
# Returns `basefile2` invisibly.
Market <- function(state, city){
  # `:=` updates `basefile2` by reference, so no `<<-` re-assignment is needed.
  # Restricting `i` to rows that are still "N" keeps earlier "Y" marks intact.
  basefile2[Consideration == "N",
            Consideration := ifelse(States == state & Cities %in% city, "Y", "N")]
  invisible(basefile2)
}
Or maybe like this:
# Set `Consideration` to "Y" on the rows of the global data.table `basefile2`
# whose `States` equals `state` and whose `Cities` is one of `city`.
# Rows that do not match are left untouched.
#
# state: value compared against the `States` column with `==`.
# city:  vector of values matched against the `Cities` column with `%in%`.
#
# Returns `basefile2` invisibly.
Market <- function(state, city){
  # `:=` modifies the table in place; re-assigning it with `<<-` is redundant.
  basefile2[States == state & Cities %in% city, Consideration := "Y"]
  invisible(basefile2)
}
Market(state = "California", city = c("LA", "California City"))
Market(state = "Texas", city = c("Dallas", "Austin"))
basefile2
States Cities Consideration
1: California LA Y
2: California California City Y
3: California San Fran N
4: Texas Houston N
5: Texas Dallas Y
6: Texas Austin Y
7: Ohio Columbus N
8: Ohio Cleaveland N
9: Ohio Wooster N

Related

Cannot change colnames inside function in R

Here is my data -
library(data.table)
basefile2 = data.table(States = c("California","California", "California", "Texas","Texas","Texas", "Ohio", "Ohio", "Ohio"),
district = c("district1", "district2", "district3", "district4", "district5", "district6", "district7","district8", "district9"),
Cities = c("LA", "California City", "San Fran", "Houston", "Dallas", "Austin", "Columbus", "Cleaveland", "Wooster"))
And here is my code -
basefile2[, Consideration := "N"] # initialize the column
# For each value in `market`, write that value into a `Level` column of the
# global data.table `basefile2` on rows where `States == state` and the column
# named by `level` holds that value; afterwards try to rename the last column.
#
# state:  value compared against the `States` column.
# level:  name (string) of the column to match against, looked up via get().
# market: vector of values to look for in that column.
Market <- function(state, level, market){
for(i in market)
{
# `Level` is a fixed column name, so every call writes into the same column
# regardless of which `level` was requested.
basefile2 <<- basefile2[States == state & get(level) %in% i, Level := paste(i)]
}
# NOTE(review): this is a plain `<-` replacement inside the function, so it
# appears to rename the last column of a function-local `basefile2` only and
# to leave the global table's names unchanged - confirm. Also note the space
# inside the "NEW_ " prefix.
names(basefile2)[length(names(basefile2))]<- paste0("NEW_ ",level)
}
Market(state = "California", level = "Cities", market = c("LA", "California City"))
Market(state = "Texas", level = "Cities", market = c("Dallas", "Austin"))
Market(state = "Texas", level = "district", market = c("district4", "district5"))
Here is my output -
States district Cities Consideration Level
1: California district1 LA N LA
2: California district2 California City N California City
3: California district3 San Fran N <NA>
4: Texas district4 Houston N district4
5: Texas district5 Dallas N district5
6: Texas district6 Austin N Austin
7: Ohio district7 Columbus N <NA>
8: Ohio district8 Cleaveland N <NA>
9: Ohio district9 Wooster N <NA>
The output I need is -
The renaming of columns is not happening via my code, I tried using data.table's setnames() too but that too does not give me my required output. Where am I going wrong?
data.table objects have reference semantics (they are modified in place rather than copied), so you don't need to assign globally with <<-.
changing names can be done either in-place (no need for "Level" placeholder) or after-the-fact with setnames. The problem with using a placeholder, though, is that within the first call in the for loop, a new column is created (regardless if it already exists) and then setnames(basefile2, "Level", paste0("NEW_", level)) might produce a duplicate name (which is possible but ... odd).
# For every value in `market`, copy that value into the column
# "NEW_<level>" of the global data.table `basefile2` on the rows where
# `States == state` and the column named by `level` holds that value.
# The table is updated by reference.
#
# state:  value compared against the `States` column.
# level:  name (string) of the column to match against.
# market: vector of values to look for in that column.
Market <- function(state, level, market){
  # The target column name depends only on `level`, so build it once.
  new_name <- paste0("NEW_", level)
  for (wanted in market) {
    basefile2[States == state & get(level) %in% wanted,
              (new_name) := paste(wanted)]
  }
}
Market(state = "California", level = "Cities", market = c("LA", "California City"))
Market(state = "Texas", level = "Cities", market = c("Dallas", "Austin"))
Market(state = "Texas", level = "district", market = c("district4", "district5"))
basefile2
# States district Cities Consideration NEW_Cities NEW_district
# 1: California district1 LA N LA <NA>
# 2: California district2 California City N California City <NA>
# 3: California district3 San Fran N <NA> <NA>
# 4: Texas district4 Houston N <NA> district4
# 5: Texas district5 Dallas N Dallas district5
# 6: Texas district6 Austin N Austin <NA>
# 7: Ohio district7 Columbus N <NA> <NA>
# 8: Ohio district8 Cleaveland N <NA> <NA>
# 9: Ohio district9 Wooster N <NA> <NA>

Need to ID states from mixed names /IDs in location data

Need to ID states from mixed location data
Need to search for 50 states abbreviations & 50 states full names, and return state abbreviation
N <- 1:10
Loc <- c("Los Angeles, CA", "Manhattan, NY", "Florida, USA", "Chicago, IL" , "Houston, TX",
+ "Texas, USA", "Corona, CA", "Georgia, USA", "WV NY NJ", "qwerty uy PO DOPL JKF" )
df <- data.frame(N, Loc)
> # Objective create variable state such
> # state contains abbreviated names of states from Loc:
> # for "Los Angeles, CA", state = CA
> # for "Florida, USA", state = FL
> # for "WV NY NJ", state = NA
> # for "qwerty NJuy PO DOPL JKF", state = NA (in spite of containing the string NJ, it is not wrapped in spaces)
>
# End result should be Newdf
State <- c("CA", "NY", "FL", "IL", "TX","TX", "CA", "GA", NA, NA)
Newdf <- data.frame(N, Loc, State)
> Newdf
N Loc State
1 1 Los Angeles, CA CA
2 2 Manhattan, NY NY
3 3 Florida, USA FL
4 4 Chicago, IL IL
5 5 Houston, TX TX
6 6 Texas, USA TX
7 7 Corona, CA CA
8 8 Georgia, USA GA
9 9 WV NY NJ <NA>
10 10 qwerty uy PO DOPL JKF <NA>
Is there a package? or can a loop be written? Even if the schema could be demonstrated with a few states, that would be sufficient - I will post the full solution when I get to it. Btw, this is for a Twitter dataset downloaded using rtweet package, and the variable is: place_full_name
There are default constants in R, state.abb and state.name which can be used.
# Build a single alternation of every state abbreviation and every full state
# name, each wrapped in word boundaries (\b), and extract the first match from
# each location string; strings without any match yield NA.
vars <- stringr::str_extract(df$Loc, paste0('\\b',c(state.abb, state.name),
'\\b', collapse = '|'))
#[1] "CA" "NY" "Florida" "IL" "TX" "Texas" "CA" "Georgia" "WV" NA
If you want everything as abbreviations, we can go further and do :
# Positions whose extracted value is a full state name rather than an
# abbreviation.
inds <- !is.na(match(vars, state.name))
# Swap each full name for the abbreviation at the same index of the built-in
# vectors; entries that were already abbreviations (or NA) are left alone.
vars[inds] <- state.abb[match(vars[inds], state.name)]
vars
#[1] "CA" "NY" "FL" "IL" "TX" "TX" "CA" "GA" "WV" NA
However, we can see that in the 9th row you expect the output to be NA, but here it returns "WV" because "WV" is a valid state abbreviation. In such cases, you need to prepare rules which are strict enough so that it only extracts state names and nothing else.
Utilising the built-in R constants, state.abb and state.name, we can try to extract these from the Loc with regular expressions.
# Replace the text up to and including ", " plus the two capital letters that
# follow with just those two letters. Anything after the two letters is kept
# (e.g. "Florida, USA" becomes "USA"), and strings without the pattern are
# returned unchanged.
state.abbs <- sub('.+, ([A-Z]{2})', '\\1', df$Loc)
# Keep only the text before the last comma; strings without a comma are
# returned unchanged.
state.names <- sub('^(.+),.+', '\\1', df$Loc)
Now if the state abbreviations are not in any of the built-in ones, then we can use match to find the positions of our state.names that are in any of the items in the built-in state.name vector, and use that to index state.abb, else keep what we already have. Those that don't match either return NA.
# Prefer the extracted two-letter code when it is a real state abbreviation;
# otherwise look the extracted name up in the built-in `state.name` vector and
# take the abbreviation at the same position (NA when the name is unknown).
df$state.abb <- ifelse(
  state.abbs %in% state.abb,
  state.abbs,
  state.abb[match(state.names, state.name)]
)
df
N Loc state.abb
1 1 Los Angeles, CA CA
2 2 Manhattan, NY NY
3 3 Florida, USA FL
4 4 Chicago, IL IL
5 5 Houston, TX TX
6 6 Texas, USA TX
7 7 Corona, CA CA
8 8 Georgia, USA GA
9 9 WV NY NJ <NA>
10 10 qwerty uy PO DOPL JKF <NA>

Extract cell with AND without commas in R

I'm trying to extract the city and state from the Address column into 2 separate columns labeled City and State in r. This is what my data looks like:
df <- data.frame(address = c("Los Angeles, CA", "Pittsburgh PA", "Miami FL","Baltimore MD", "Philadelphia, PA", "Trenton, NJ")) %>%
separate(address, c("City", "State"), sep=",")
I tried using the separate function but that only gets the ones with commas. Any ideas on how to do this for both cases?
There is a pattern at the end (space, letter, letter) which I can use to exploit and then remove any commas but not sure how the syntax would work using grep.
Starting from your df
df <- data.frame(address = c("Los Angeles, CA", "Pittsburgh PA", "Miami FL","Baltimore MD", "Philadelphia, PA", "Trenton, NJ"))
> df
address
1 Los Angeles, CA
2 Pittsburgh PA
3 Miami FL
4 Baltimore MD
5 Philadelphia, PA
6 Trenton, NJ
It's possible to use gsub to subset the string like this:
> city=gsub(',','',gsub("(.*).{3}","\\1",df[,1]))
> city
[1] "Los Angeles" "Pittsburgh" "Miami" "Baltimore" "Philadelphia"
[6] "Trenton"
> state=gsub(".*(\\w{2})","\\1",df[,1])
> state
[1] "CA" "PA" "FL" "MD" "PA" "NJ"
df=data.frame(City=city,State=state)
> df
City State
1 Los Angeles CA
2 Pittsburgh PA
3 Miami FL
4 Baltimore MD
5 Philadelphia PA
6 Trenton NJ
This is a little unorthodox but it works well. It assumes that all states are 2 characters long and that there is at least 1 space between the city and state. Commas are ignored.
# Split addresses of the form "City, ST" or "City ST" into `city` and `state`
# columns. Relies on every address ending in one separator character followed
# by a two-character state code.
df <- data.frame(address = c("Los Angeles, CA", "Pittsburgh PA", "Miami FL","Baltimore MD", "Philadelphia, PA", "Trenton, NJ"))
df <- local({
  raw <- as.character(df$address)
  # Drop the first comma so that exactly three trailing characters
  # (separator + state code) remain to be cut off.
  bare <- sub(",", "", raw)
  width <- nchar(raw)

  out <- df
  out$city <- substr(bare, 1, nchar(bare) - 3)
  # The state code is the last two characters of the untouched address.
  out$state <- substr(raw, width - 1, width)
  out$address <- NULL
  out
})
output:
city state
1 Los Angeles CA
2 Pittsburgh PA
3 Miami FL
4 Baltimore MD
5 Philadelphia PA
6 Trenton NJ

Finding Average of One Column Based on 2 Other Columns RStudio

I currently have a data frame that has three columns (City, State and Income) I wrote an example of the data below...
City State Income
Addison Illinois 71,000
Addison Illinois 101,000
Addison Illinois 81,000
Addison Texas 74,000
As you can see there are repeats of the cities. There are several Addison, IL's because income differs by the zip-code/area of the city.
I want to take the average of all incomes in a given city and state. In this example I want the average of all Addison IL's but NOT including Addison, Texas.
I am looking for this (in this given example)
City State MeanIncome
Addison Illinois 84,333
Addison Texas 74,000
I tried this:
Income_By_City <- aggregate( Income ~ City, df, mean )
But it gave me the average of ALL Addison's, including Texas...
Is there a way to take the average of Income Column, based on City AND State??
I am pretty new to coding, so I'm not sure if this is a simple question. But I would appreciate any help I can get.
library(dplyr)
# Example data: several income rows for the same city name in two states.
df <- data.frame(
  City = c("Addison", "Addison", "Addison", "Addison"),
  State = c("Illinois", "Illinois", "Illinois", "Texas"),
  Income = c(71000, 101000, 81000, 74000)
)
# One row per City/State pair holding the mean of that pair's incomes.
by_city_state <- group_by(df, City, State)
summarise(by_city_state, MeanIncome = mean(Income))
# City State MeanIncome
#1 Addison Illinois 84333.33
#2 Addison Texas 74000.00
Here is a dplyr solution:
library(tidyverse)
# Same example data, built column by column instead of row by row.
df <- tibble(
  City = c("Addison", "Addison", "Addison", "Addison"),
  State = c("Illinois", "Illinois", "Illinois", "Texas"),
  Income = c(71000, 101000, 81000, 74000)
)
# Keep every original row and attach the mean income of its City/State group.
mutate(group_by(df, City, State), AverageIncome = mean(Income))
# A tibble: 4 x 4
# Groups: City, State [2]
City State Income AverageIncome
<chr> <chr> <dbl> <dbl>
1 Addison Illinois 71000 84333.33
2 Addison Illinois 101000 84333.33
3 Addison Illinois 81000 84333.33
4 Addison Texas 74000 74000.00

Summing over columns with if statement in R

I have a data that looks like this.
bankname bankid year totass corresbankname1 corresbankloc1 corresdepoin1 corresbankname2 corresbankloc2 corresdepoin2 corresbankname3 corresbankloc3 corresdepoin3
BankA 1 1881 244789 First Bank New York 7250.32 Third National Bank Philadelphia 20218.2 Commercial Bank Philadelphia 29513.4
BankB 2 1881 195755 National Bank Pittsburgh 10243.6 Union Trust Company New York 1851.51 NA NA NA
Bankc 3 1881 107736 Mechanics' Bank New York 13357.8 Wyoming Bank Wilkes-Barre 17761.2 NA NA NA
BankD 4 1881 170600 Commonwealth Bank Philadelphia 3.35 Seventh National Bank Philadelphia 2 NA NA NA
BankE 5 1881 320000 National Bank New York 351266 Mechanics' Bank New York 314012 National Park Bank New York 206580
This can be replicated by
bankname <- c("The Anchor Savings Bank of Pittsburgh","The Arsenal Bank","The Ashley Savings Bank","The Bank of America of Philadelphia","The Bank of Pittsburgh")
bankid <- c( 1, 2, 3, 4, 5)
year<- c( 1881, 1881, 1881, 1881, 1881)
totass <- c(244789, 195755, 107736, 170600, 32000000)
corresbankname1 <- c("First National Bank","National Bank of Commerce","Mechanics' National Bank","Commonwealth National Bank","National Bank of Commerce")
corresbankloc1 <-c("Philadelphia","Pittsburgh","New York","Philadelphia","New York")
corresdepoin1<-c(7250.32,10243.6,13357.8,3.35,351266)
corresbankname2 <- c("Third National Bank","Union Trust Company","Wyoming National Bank","Seventh National Bank","Mechanics' National Bank")
corresbankloc2<-c("New York","New York","Wilkes-Barre","Philadelphia","New York")
corresdepoin2<-c(20218.2,1851.51,17761.2,2,314012)
corresbankname3<-c("Commercial National Bank",NA,NA,NA,"National Park Bank")
corresbankloc3<-c("Philadelphia",NA,NA,NA,"New York")
corresdepoin3<-c(29513.4,NA,NA,NA,206580)
bankdata<-data.frame(bankname, bankid,year,totass,corresbankname1,corresbankloc1,corresdepoin1,corresbankname2,corresbankloc2,corresdepoin2,corresbankname3,corresbankloc3,corresdepoin3)
This dataset shows the amount(corresdepoin) each bank invested in other banks (corresbankname) and its location (corresbankloc). I have 43 corresbankname, corresbankloc and corresdepoin variables.
Since these banks invest in multiple banks in the same city, I would like to know the total amount of investments in each city. Hence, I would like to generate a new column variable called "total_New York" and sum the amounts indicated by corresdepoin whenever the matching corresbankloc is New York. How can I loop over the 43 variables?
For instance, BankE has $351266 (corresdepoin1) at National Bank (corresbankname1) in New York (corresbankloc1), $314012 at Mechanics' Bank in New York, and $206580 at National Park Bank in New York. I want a new column called "total deposits in New York" showing that the total amount of investments in banks located in New York is $871858. Hence, what I want is a conditional statement that loops over the corresbankloc columns, checks whether each one is New York (or another city), and then sums the associated values in "corresdepoin" to get the aggregated amount of total investment in that city for each "bankname."
Also, in stata, if I want to do this for multiple cities, I would generate
local cities "New York" "Philadelphia" "Pittsburgh"
and loop over them. Is there a similar function in R?
Thank you in advance.
Another option would be to reshape the dataset to long format and summarise it with dplyr. You may create a function that outputs the totals for a subset of cities or for every city in the data. I don't know if this is efficient.
library(dplyr)
# Total correspondent deposits per bank and correspondent city.
#
# Reshapes the wide table (one `corresbankloc<k>` / `corresdepoin<k>` column
# pair per correspondent slot) to long form and sums `corresdepoin`.
#
# data:    data frame with a `bankname` column plus matching `corresbankloc*`
#          and `corresdepoin*` columns.
# city:    vector of cities to keep; only evaluated when `allcity` is FALSE,
#          so it may be omitted otherwise.
# byloc:   if TRUE, report one row per correspondent slot (`Bankloc`);
#          if FALSE, add up all slots that share a city.
# allcity: if TRUE, report every city present in the data.
#
# Returns the summarised table with columns `bankname`, `corresbankloc`
# (prefixed with "Totalbylocation_"), `Bankloc` (only when `byloc` is TRUE)
# and `Total`.
fun1 <- function(data, city, byloc = TRUE, allcity = TRUE) {
  long <- reshape(data, idvar = "bankname",
    varying = list(grep("corresdepoin", colnames(data)),
                   grep("corresbankloc", colnames(data))),
    timevar = "Bankloc", direction = "long",
    v.names = c("corresdepoin", "corresbankloc"))
  # Unused correspondent slots show up as NA locations after reshaping.
  long <- long[!is.na(long$corresbankloc), ]
  # seq_len() also copes with zero remaining rows, where 1:nrow() yields c(1, 0).
  row.names(long) <- seq_len(nrow(long))

  if (!allcity) {
    long <- long %>%
      filter(corresbankloc %in% city)
  }

  # Label the city on the plain data frame before grouping. Doing it afterwards
  # through `[ , 2]` hands paste() a one-column table instead of a vector, which
  # collapses the whole column into a single string. sprintf() returns a
  # zero-length result for zero rows, so an empty selection stays valid.
  long$corresbankloc <- sprintf("Totalbylocation_%s",
                                as.character(long$corresbankloc))

  grouped <- if (byloc) {
    long %>% group_by(bankname, corresbankloc, Bankloc)
  } else {
    long %>% group_by(bankname, corresbankloc)
  }

  grouped %>%
    summarise(Total = sum(corresdepoin, na.rm = TRUE))
}
as.data.frame(fun1(bankdata, "New York", byloc=TRUE, allcity=FALSE))
# bankname corresbankloc Bankloc
#1 The Anchor Savings Bank of Pittsburgh Totalbylocation_New York 2
#2 The Arsenal Bank Totalbylocation_New York 2
#3 The Ashley Savings Bank Totalbylocation_New York 1
#4 The Bank of Pittsburgh Totalbylocation_New York 1
#5 The Bank of Pittsburgh Totalbylocation_New York 2
#6 The Bank of Pittsburgh Totalbylocation_New York 3
# Total
#1 20218.20
#2 1851.51
#3 13357.80
#4 351266.00
#5 314012.00
#6 206580.00
as.data.frame(fun1(bankdata, "New York", byloc=FALSE, allcity=FALSE))
# bankname corresbankloc Total
#1 The Anchor Savings Bank of Pittsburgh Totalbylocation_New York 20218.20
#2 The Arsenal Bank Totalbylocation_New York 1851.51
#3 The Ashley Savings Bank Totalbylocation_New York 13357.80
#4 The Bank of Pittsburgh Totalbylocation_New York 871858.00
as.data.frame(fun1(bankdata, c("New York", "Pittsburgh"), byloc=FALSE, allcity=FALSE))
as.data.frame(fun1(bankdata, byloc=TRUE, allcity=TRUE))

Resources