extract country names (or other entity) from column - r

I have a data.frame containing countries and cities in the column location, and I want to extract the former by matching with the world.cities$country.etc dataframe from library(maps) (or any other collection of country names).
Consider this example:
# Example input: free-text locations that mix city and country names, plus
# one unrelated numeric column.
df <- data.frame(
  location = c("Aarup, Denmark", "Switzerland", "Estonia: Aaspere"),
  other_col = c(2, 3, 4)
)
I attempted using this code
df %>% extract(location,
into = c("country", "rest_location"),
remove = FALSE,
function(x) x[which x %in% world.cities$country.etc])
But I am not successful; I expect something like this:
location other_col country rest_location
1 Aarup, Denmark 2 Denmark Aarup,
2 Switzerland 3 Switzerland
3 Estonia: Aaspere 4 Estonia : Aaspere

We can create a pattern of all country names by pasting them together and use str_extract_all to get all the country names which match the pattern in location and remove the words which match the country names to get rest_location.
library(maps)
library(stringr)
# Put longer names first: regex alternation takes the first alternative that
# matches at a position, so a short name (e.g. "Guinea") would otherwise
# shadow a longer one that starts with it (e.g. "Guinea-Bissau").
country_names <- unique(world.cities$country.etc)
country_names <- country_names[order(nchar(country_names), decreasing = TRUE)]
# Word boundaries stop a name from matching inside a longer word
# (e.g. "Niger" inside "Nigeria").
# NOTE(review): assumes the names contain no regex metacharacters - confirm,
# or escape them before building the pattern.
all_countries <- str_c("\\b(?:", str_c(country_names, collapse = "|"), ")\\b")
# A location may mention several countries; toString() joins them with ", "
# so every row yields exactly one string.
df$country <- vapply(
  str_extract_all(df$location, all_countries),
  toString,
  character(1)
)
# Whatever is left after deleting the country names is the rest of the location.
df$rest_location <- str_remove_all(df$location, all_countries)
#OR can also do
#df$rest_location <- str_remove_all(df$location, df$country)
df
# location other_col country rest_location
#1 Aarup, Denmark 2 Denmark Aarup,
#2 Switzerland 3 Switzerland
#3 Estonia: Aaspere 4 Estonia : Aaspere
toString is applied to each element for country because, if a location contains more than one country name, they are all concatenated into one string.

You can try this as a starting point:
library(tidyverse)
# world.cities lives in the maps package, which tidyverse does not attach.
library(maps)
# Keep the original rows together with a row id so the untouched location
# can be joined back after it has been split into words.
df_with_id <- df %>%
  rownames_to_column()
df_with_id %>%
  # One row per "word" of location (default separator: non-alphanumerics).
  separate_rows(location) %>%
  # Label each word as the country or as the remainder of the location.
  mutate(
    gr = if_else(
      location %in% world.cities$country.etc,
      "country",
      "rest_location"
    )
  ) %>%
  # Back to one row per original row, with one column per label.
  pivot_wider(names_from = gr, values_from = location) %>%
  right_join(df_with_id, by = c("rowname", "other_col")) %>%
  select(location, other_col, country, rest_location)
location other_col country rest_location
1 Aarup, Denmark 2 Denmark Aarup
2 Switzerland 3 Switzerland <NA>
3 Estonia: Aaspere 4 Estonia Aaspere
Of note, this only works if there are only two "words" in the location column. If necessary, you have to specify a suitable separator, e.g. sep = ",|:".

Base R (not including maps package):
# Import the library:
library(maps)
# Split the string on the spaces (as.character() in case location is a
# factor, which strsplit() rejects):
country_city_vec <- strsplit(as.character(df$location), "\\s+")
# Replicate the other col's rows by the split string vec:
rolled_out_df <- data.frame(
  other_col = rep(df$other_col, lengths(country_city_vec)),
  location = gsub("[[:punct:]]", "", unlist(country_city_vec)),
  stringsAsFactors = FALSE
)
# Match with the world df:
is_country <- rolled_out_df$location %in% world.cities$country.etc
matched_with_world_df <- merge(
  df,
  setNames(rolled_out_df[is_country, ], c("other_col", "country")),
  by = "other_col",
  all.x = TRUE
)
# Extract the city/location drilldown. Rows without a matched country carry
# NA (because of all.x = TRUE); drop those so the literal text "NA" never
# becomes part of the removal pattern.
found_countries <- unique(na.omit(matched_with_world_df$country))
rest_location <- as.character(matched_with_world_df$location)
if (length(found_countries) > 0) {
  rest_location <- gsub(
    paste0(found_countries, collapse = "|"),
    "",
    rest_location
  )
}
matched_with_world_df$rest_location <- trimws(
  gsub("[[:punct:]]", "", rest_location),
  "both"
)

Related

Assign a conditional value to new created column

My Data frame looks like this
Now, I want to add a new column which assigns one (!) specific value to each country. That means, there is only one value for Australia, one for Canada etc. for every year.
It should look like this:
Year Country R Ineq Adv NEW_COL
2018 Australia R1 Ineq1 1 x_Australia
2019 Australia R2 Ineq2 1 x_Australia
1972 Canada R1 Ineq1 1 x_Canada
...
Is there a smart way to do this?
Appreciate any help!
You can use merge.
# Main table: one row per observation; a country may appear several times.
x <- data.frame(
  country = c("AUS", "CAN", "AUS", "USA"),
  val1 = 1:4
)
# Reference table: exactly one value per country.
y <- data.frame(
  country = c("AUS", "CAN", "USA"),
  val2 = c("a", "b", "c")
)
# country is the only column the two tables share, so it is the join key.
merge(x, y, by = "country")
country val1 val2
1 AUS 1 a
2 AUS 3 a
3 CAN 2 b
4 USA 4 c
You just manually create the (probably significantly smaller!) reference table that then gets duplicated in the original table in the merge. As you can see, my 3 row table (with a,b,c) is correctly duplicated up to the original (4 row) table such that every AUS gets "a".
You may use mutate and case_when from the package dplyr:
library(dplyr)
data <- data.frame(country = rep(c("AUS", "CAN"), each = 2))
# Map every country to its value; a country matching no condition gets NA.
data <- data %>%
  mutate(
    newcol = case_when(
      country == "CAN" ~ 1,
      country == "AUS" ~ 2
    )
  )
print(data)
You can use mutate and group_indices:
library(dplyr)
Sample data:
# Ten random year/country pairs; the values differ on every run unless a
# seed is set beforehand.
sample.df <- data.frame(
  Year = sample(1971:2019, 10, replace = TRUE),
  Country = sample(c("AUS", "Can", "UK", "US"), 10, replace = TRUE)
)
Create new variable called ID, and assign unique ID to each Country group:
# Passing columns to group_indices() is deprecated since dplyr 1.0.0; group
# first and number the groups with cur_group_id() instead.
sample.df <- sample.df %>%
  group_by(Country) %>%
  mutate(ID = cur_group_id()) %>%
  ungroup() %>%
  # Keep a plain data.frame, as the ungrouped mutate() would have returned.
  as.data.frame()
If you want it to appear as x_Country, you can use paste (as commented):
# Passing columns to group_indices() is deprecated since dplyr 1.0.0; group
# first and build the label from cur_group_id() instead.
sample.df <- sample.df %>%
  group_by(Country) %>%
  mutate(ID = paste(cur_group_id(), Country, sep = "_")) %>%
  ungroup() %>%
  # Keep a plain data.frame, as the ungrouped mutate() would have returned.
  as.data.frame()

Extract partial string from a dataframe column with many text elements (without Regex)

I have a dataframe with a column full of text. It's hard to find any Regex pattern in it. What I am interested in though is to extract all countries based on a predefined list of countries I already have. I figured it shouldn't be too hard, but cannot find my way around str_extract to do this
Here's a replicable example:
data <- data.frame (text_column = c("I travelled to Germany last year, afterwards I visited Poland"," I enjoyed my vacation in Spain", "The weather in the Netherlands was not great"))
And I have a list of countries
country_vector <- c("Germany", "Poland","the Netherlands","France")
What I want is to match the text column against those countries, extract all matches and pivot them into separate columns for all matches. If no match, then we can enter NA.
So the expected outcome for this is:
Country_1 Country_2
Germany Poland
NA NA
the Netherlands NA
Given that there can be a multitude of countries mentioned in one column the ncol of the final tibble/df needs to be the length of the entry with the maximum number of countries.
Any idea how to tackle this? I've been able to do it for 1 entry but not for my entire dataframe. But I have the feeling I'm working in the wrong direction and there must be an easier solution.
# NOTE(review): str_extract() is vectorised over both arguments, so it pairs
# text_column[i] with country_vector[i] instead of trying every country on
# every row - which is why this approach only works for a single entry.
temp <- stringr::str_extract(data$text_column, country_vector) %>%
  as.data.frame() %>%
  magrittr::set_colnames(c("countries")) %>%
  dplyr::filter(!is.na(countries))
# Guarantee at least one row so the wide result always has a country_1 column.
if (nrow(temp) == 0) {
  temp <- temp %>% add_row(countries = NA)
}
# Number the matches and spread them into country_1, country_2, ... columns.
temp <- temp %>%
  mutate(order = paste0("country_", seq_len(nrow(.)))) %>%
  pivot_wider(., names_from = order, values_from = countries)
print(temp)
str_extract_all gives the data that you want :
library(stringr)
# One alternation pattern that matches any of the known countries.
country_pattern <- str_c(country_vector, collapse = '|')
# Returns a list with one character vector of matches per input string.
str_extract_all(data$text_column, country_pattern)
#[[1]]
#[1] "Germany" "Poland"
#[[2]]
#character(0)
#[[3]]
#[1] "the Netherlands"
To get the data in the format that you want you need to manipulate the above output.
library(dplyr)
# Turn one string's matches into a one-row data frame; a string without any
# match becomes a single NA so its row is not lost when the rows are stacked.
matches_to_row <- function(found) {
  if (length(found)) {
    as.data.frame(t(found))
  } else {
    as.data.frame(t(NA))
  }
}
country_matches <- str_extract_all(
  data$text_column,
  str_c(country_vector, collapse = '|')
)
# bind_rows() pads the shorter rows with NA.
bind_rows(lapply(country_matches, matches_to_row))
# V1 V2
#1 Germany Poland
#2 <NA> <NA>
#3 the Netherlands <NA>
There is a simplify option in str_extract_all, which returns a matrix. So, we can directly convert to a two column matrix and wrap with as.data.frame (if a data.frame is needed)
library(stringr)
# simplify = TRUE yields a character matrix with one row per input string;
# rows with fewer matches are filled with empty strings rather than NA.
country_matrix <- str_extract_all(
  data$text_column,
  str_c(country_vector, collapse = '|'),
  simplify = TRUE
)
as.data.frame(country_matrix)
# V1 V2
#1 Germany Poland
#2
#3 the Netherlands

Getting rid of one quote at the beginning of a string

I have a dataframe df and in 1 of the columns (region) one of the strings is "Latin/South America
How would I get rid of the quote at the beginning? I'm not sure how to get rid of just one quote.
Thanks in advance
Let's see this example below
library(tidyverse)
library(stringr)
# Two sample regions that both start with a stray double quote.
df <- data.frame(
  obs = 1:2,
  Region = c('"Latin/South America', '"Asia/Europe')
)
# ^" is anchored to the start of the string, so only a leading quote is
# removed; quotes elsewhere in the value are left alone.
leading_quote <- regex('^"')
df2 <- mutate(df, Region = str_replace_all(Region, leading_quote, ""))
# obs Region
# 1 1 Latin/South America
# 2 2 Asia/Europe

R Create or Modify a dataframe using dplyr

I am very new to programming and am learning how to use dplyr, and I am wondering how to solve this problem:
I have this dataframe:
# Input data: seven countries, some of which share the same value
# (USA/Canada share 1, Denmark/Albania share 3).
countries <- c("USA","Canada","Denmark","Albania", "Turkey","France", "Italy")
values <- c(1, 1, 3, 3,7,8,9)
# stringsAsFactors = FALSE keeps countries as plain character strings.
old_df <- data.frame(countries, values, stringsAsFactors = FALSE)
I want to restructure my dataset to obtain this:
countries <- c("USA , Canada","Denmark , Albania", "Turkey","France", "Italy")
values <- c(1,3,7,8,9)
new_df <- data.frame(countries, values, stringsAsFactors = FALSE)
Because I am using dplyr, I think that the best way to solve my problem could be:
library(dplyr)
new_df <- group_by(values) %>%
transmute(countries = countries) %>%
ungroup
Thank you in advance for any clue about how to solve this.
library(dplyr)
# One output row per distinct value; all countries sharing that value are
# collapsed into a single comma-separated string.
by_value <- group_by(old_df, values)
summarise(by_value, countries = toString(countries))
# # A tibble: 5 x 2
# values countries
# <dbl> <chr>
# 1 1 USA, Canada
# 2 3 Denmark, Albania
# 3 7 Turkey
# 4 8 France
# 5 9 Italy
The point here is that for each unique value in values you want to combine some of your rows, so you need to use summarise (i.e. you want to end up with one row per values value).
You can use summarise(countries = paste0(sort(countries), collapse = ", ")) if you want to apply an alphabetical order when you combine countries.

Doing a ranged lookup with multiple variables in a matrix in R

I feel like I have a bit of a complicated problem (or at least for me it is!).
I have a table of prices which will need to be read from a csv which will look exactly like this:
# Lookup table laid out the way it arrives from the csv: the first two
# elements of every vector are header rows (board type, then column label),
# followed by nine data rows (three destinations x three duration bands).
V1 <- c("","Destination","Spain","Spain","Spain","Portugal","Portugal","Portugal","Italy","Italy","Italy")
V2 <- c("","Min_Duration",rep(c(1,3,6),3))
V3 <- c("","Max_Duration",rep(c(2,5,10),3))
# runif() draws new random prices on every run. Mixing strings and numbers
# in c() coerces each whole vector to character.
V4 <- c("Full-board","Level_1",runif(9,100,200))
V5 <- c("Full-board","Level_2",runif(9,201,500))
V6 <- c("Full-board","Level_3",runif(9,501,1000))
V7 <- c("Half-board","Level_1",runif(9,100,200))
V8 <- c("Half-board","Level_2",runif(9,201,500))
V9 <- c("Half-board","Level_3",runif(9,501,1000))
# NOTE(review): V9 (Half-board, Level_3) is defined above but is not passed
# to cbind(), so Lookup_matrix has no Half-board Level_3 column.
Lookup_matrix <- as.data.frame(cbind(V1,V2,V3,V4,V5,V6,V7,V8))
The prices in the above table will of course come out a bit strange as they're completely random - but we can ignore that...
I also have a table like this:
# Rows whose Price is to be filled in from the lookup table; element i of
# each vector describes row i.
Destination <- c("Spain", "Italy", "Portugal")
Duration <- c(2,4,8)
Level <- c(1,3,3)
Board <- c("Half-board","Half-board","Full-board")
# Placeholder value; cbind() recycles this single string across all rows.
Price <- "Empty"
# cbind() builds a character matrix first, so every column ends up as text
# (or as a factor, depending on the R version's stringsAsFactors default).
Price_matrix <- as.data.frame(cbind(Destination,Duration,Level,Board,Price))
My question is - how do I populate the 'Price' column of the price matrix with the corresponding prices that can be found in the lookup matrix? Please note that the duration variable of the price matrix will have to fit into a range found between the 'Min_Duration' and 'Max_Duration' columns in the lookup matrix.
In Excel I would use an Index,Match formula. But I'm stumped with R.
Thanks in advance,
Dan
Here is a tidyverse possibility
First, please note that I rename your input objects; both Price_matrix and Lookup_matrix are data.frames (not matrices).
# Work on copies under shorter names: df1 holds the rows to price, df2 the
# lookup table.
df1 <- Price_matrix
df2 <- Lookup_matrix
Next we need to fix the column names of df2 = Lookup_matrix.
# Fix column names: join the two header rows with "_" and strip the leading
# "_" that is left behind where the first header row is empty.
header_rows <- as.matrix(df2[1:2, ])
joined_names <- paste(header_rows[1, ], header_rows[2, ], sep = "_")
colnames(df2) <- sub("^_", "", joined_names)
# The header rows are now encoded in the names, so drop them from the data.
df2 <- df2[-(1:2), ]
We now basically do a left join of df1 and df2; in order for df2 to be in a suitable format, we reshape the data from wide to long, extract Price values for every Board and Level, and expand entries from Min_Duration to Max_Duration. Then we join by Destination, Duration, Level and Board.
Note that in your example, Destination = Italy has no Level = 3 entry in Lookup_matrix; we therefore get Price = NA for this entry.
library(tidyverse)
# No `by` is given, so the join uses every column the two tables share
# (Destination, Duration, Level, Board) and reports them in a message.
left_join(
  df1 %>%
    mutate(across(where(is.factor), as.character)) %>%
    # Price only holds a placeholder; the real value comes from df2.
    select(-Price),
  df2 %>%
    mutate(across(where(is.factor), as.character)) %>%
    # Wide to long: one row per destination, duration band and price column.
    pivot_longer(
      cols = -c(Destination, Min_Duration, Max_Duration),
      names_to = "key",
      values_to = "Price"
    ) %>%
    # "Full-board_Level_1" -> Board = "Full-board", Level = "Level_1".
    separate(key, into = c("Board", "Level"), sep = "_", extra = "merge") %>%
    mutate(Level = sub("Level_", "", Level)) %>%
    # Expand every Min_Duration..Max_Duration band into one row per duration
    # so that the range lookup becomes a plain equality join.
    mutate(
      Duration = Map(seq, as.numeric(Min_Duration), as.numeric(Max_Duration))
    ) %>%
    unnest(cols = Duration) %>%
    select(-Min_Duration, -Max_Duration) %>%
    # df1 stores Duration as text, so the join key must be text as well.
    mutate(Duration = as.character(Duration)))
#Joining, by = c("Destination", "Duration", "Level", "Board")
# Destination Duration Level Board Price
#1 Spain 2 1 Half-board 119.010942545719
#2 Italy 4 3 Half-board <NA>
#3 Portugal 8 3 Full-board 764.536124917446
Using data.table:
library(data.table)
# Build the column names by pasting the two header rows together per column
# and trimming the blank left where the first header row is empty.
nms = trimws(do.call(paste, transpose(Lookup_matrix[1:2, ])))# column names
# Paste the data rows (everything after the two header rows) into
# newline-separated text and write it to mm.csv.
cat(do.call(paste, c(collapse="\n", Lookup_matrix[-(1:2), ])), file = "mm.csv")
# Rewrite the data in the correct format. You do not have to.
# Just doing Lookup_matrix1 = setNames(Lookup_matrix[-(1:2),],nms) is enough
# but it will not have rectified the column classes.
Lookup_matrix1 = fread("mm.csv", col.names = nms)
# Reshape to long with the first three columns as ids, derive Board and
# Level from the melted column name, then join Price_matrix (minus its 5th
# column, Price) on Destination/Board/Level and on Duration lying within
# [Min_Duration, Max_Duration].
# NOTE(review): the sub() patterns expect names such as "Half.board.Level_1"
# - confirm that this matches what nms actually contains.
melt(Lookup_matrix1, 1:3)[,
c("Board", "Level") := .(sub("[.]", "-", sub("\\.Leve.*", "", variable)), sub("\\D+", "", variable))][
Price_matrix[, -5], on=c("Destination", "Board", "Level", "Min_Duration <= Duration", "Max_Duration >= Duration")]
Destination Min_Duration Max_Duration variable value Board Level
1: Spain 2 2 Half.board.Level_1 105.2304 Half-board 1
2: Italy 4 4 <NA> NA Half-board 3
3: Portugal 8 8 Full.board.Level_3 536.5132 Full-board 3

Resources