Replace all partial string entries with NA - r

I have a data frame similar to:
df<-as.data.frame(cbind(rep("Canada",6),
c(rep("Alberta",3), rep("Manitoba",2),rep("Unknown_province",1)),
c("Edmonton", "Unknown_city","Unknown_city","Brandon","Unknown_city","Unknown_city")))
colnames(df)<- c("Country","Province","City")
I would like to substitute all entries that contain "Unknown" with NA.
I have tried using grepl, but it removes all entries for that variable if one entry matches, I would like to only replace individual cells.
df[grepl("Unknown", df, ignore.case=TRUE)] <- NA

df1 <- df # Work on a copy so that we can refer back to df in case there is an issue
Then you could use any of the following:
is.na(df1) <- array(grepl('Unknown', as.matrix(df1)), dim(df1))
df1
Country Province City
1 Canada Alberta Edmonton
2 Canada Alberta <NA>
3 Canada Alberta <NA>
4 Canada Manitoba Brandon
5 Canada Manitoba <NA>
6 Canada <NA> <NA>
or even:
df1[] <- sub("Unknown.*", NA, as.matrix(df1), ignore.case = TRUE)
df1
Country Province City
1 Canada Alberta Edmonton
2 Canada Alberta <NA>
3 Canada Alberta <NA>
4 Canada Manitoba Brandon
5 Canada Manitoba <NA>
6 Canada <NA> <NA>
Note that grepl and even sub are vectorized, hence there is no need to use the *apply family or even for loops.

Here is one possible way to solve your problem:
# Blank out every cell whose text contains "Unknown" (case-insensitive), column by column.
# replace() keeps each column's type and attributes; ifelse() would strip them,
# so a factor column would come back as its integer codes.
df[] <- lapply(df, function(x) replace(x, grepl("Unknown", x, ignore.case = TRUE), NA))
df
# Country Province City
# 1 Canada Alberta Edmonton
# 2 Canada Alberta <NA>
# 3 Canada Alberta <NA>
# 4 Canada Manitoba Brandon
# 5 Canada Manitoba <NA>
# 6 Canada <NA> <NA>

Using dplyr
library(dplyr)
library(stringr)
df %>%
mutate(across(everything(),
~ case_when(str_detect(., 'Unknown', negate = TRUE) ~ .)))
Country Province City
1 Canada Alberta Edmonton
2 Canada Alberta <NA>
3 Canada Alberta <NA>
4 Canada Manitoba Brandon
5 Canada Manitoba <NA>
6 Canada <NA> <NA>

I like to use replace() in such cases in which values in a vector are replaced or left as is, depending on a condition :
library(dplyr)
library(stringr)
df%>%mutate(across(everything(), ~replace(.x, str_detect(.x, 'Unknown'), NA)))
Country Province City
1 Canada Alberta Edmonton
2 Canada Alberta <NA>
3 Canada Alberta <NA>
4 Canada Manitoba Brandon
5 Canada Manitoba <NA>
6 Canada <NA> <NA>

df[]<- lapply(df, gsub, pattern = "Unknown", replacement = NA, fixed = TRUE)

Related

Find city, state and country information from a location string in R

I have a string vector with location information. Here is the part of my string
location_information = c("Hartville, Ohio","Malaysia,Johor Bahru","Culpeper, irginia",
"MD", "Atlanta","Granada Hills CA","Kansas City, mo")
With this string vector, I wanted to get the city, state, and country information. Here is the desired output for the sample.
desired_out = data.frame( Country = c("US","Malaysia",rep("US",5)),
State = c("Ohio",NA,"Virginia","Maryland","Georgia","California","Missouri"),
City = c("Hartville","Johor Bahru","Culpeper",NA, "Atlanta","Granada Hills","Kansas City"))
How can I get that information with the consistent string format?
I think I may need to use Google API or something. How can I do it in R?
Here is a solution using the geocoding from openstreetmap to get needed additional information.
Note that you (probably) will not be able to parse hundreds/thousands of locations in one go.
library(tmap)
library(tmaptools)
library(dplyr)
# sample data of locations
location_information = c("Hartville, Ohio","Malaysia,Johor Bahru","Culpeper, Virginia",
"MD", "Atlanta","Granada Hills CA","Kansas City, mo")
# geocode the locations
loc.data <- tmaptools::geocode_OSM(location_information, as.sf = TRUE)
# reverse geocode the locations for additional OSM data
tmaptools::rev_geocode_OSM(loc.data) %>%
dplyr::select(country, state, city, town, village, city_district)
# country state city town village city_district
# 1 United States Ohio <NA> <NA> Hartville <NA>
# 2 Malaysia Johor Johor Bahru <NA> <NA> <NA>
# 3 United States Virginia <NA> Culpeper <NA> <NA>
# 4 United States Maryland <NA> <NA> <NA> <NA>
# 5 United States Georgia Atlanta <NA> <NA> <NA>
# 6 United States California Los Angeles <NA> <NA> Granada Hills
# 7 United States Missouri Kansas City <NA> <NA> <NA>

Make duplicate rows but change specific characters to lowercase in duplicates - R

I have a dataframe that looks like this:
+------------+
|site |
+------------+
|JPN Tokyo |
|AUS Sydney |
|CHN Beijing |
But I'd like to make duplicate rows of the existing rows but with the 2nd and 3rd character changed to lowercase such that the dataframe becomes like this:
+------------+
|site |
+------------+
|JPN Tokyo |
|Jpn Tokyo |
|AUS Sydney |
|Aus Sydney |
|CHN Beijing |
|Chn Beijing |
Would anyone have an idea how to do that?
We expand the rows with uncount, then create a logical condition with duplicated on the 'site', replace the substring values to lower case using sub within case_when
library(dplyr)
library(tidyr)
library(stringr)
df1 <- df1 %>%
uncount(2) %>%
mutate(site = case_when(duplicated(site)
~ sub("^(.)(\\w+)", "\\1\\L\\2", site, perl = TRUE),
TRUE ~ site))
-output
df1
# A tibble: 6 x 1
site
<chr>
1 JPN Tokyo
2 Jpn Tokyo
3 AUS Sydney
4 Aus Sydney
5 CHN Beijing
6 Chn Beijing
data
df1 <- structure(list(site = c("JPN Tokyo", "AUS Sydney", "CHN Beijing"
)), class = "data.frame", row.names = c(NA, -3L))
edit: #AnilGoyal suggested the use of map_dfr, that reduced the call to only one line.
library(tidyverse)
data <-
tribble(
~site,
'JPN Tokyo',
'AUS Sydney',
'CHN Beijing' )
#option1
map_dfr(data$site, ~list(sites = c(.x, str_to_title(.x))))
#> # A tibble: 6 x 1
#> sites
#> <chr>
#> 1 JPN Tokyo
#> 2 Jpn Tokyo
#> 3 AUS Sydney
#> 4 Aus Sydney
#> 5 CHN Beijing
#> 6 Chn Beijing
#option2
map(data$site, ~rbind(.x, str_to_title(.x))) %>%
reduce(rbind) %>%
tibble(site = .)
#> # A tibble: 6 x 1
#> site[,1]
#> <chr>
#> 1 JPN Tokyo
#> 2 Jpn Tokyo
#> 3 AUS Sydney
#> 4 Aus Sydney
#> 5 CHN Beijing
#> 6 Chn Beijing
Created on 2021-06-08 by the reprex package (v2.0.0)
You can use substr to replace characters at specific position.
df1 <- df
substr(df1$site, 2, 3) <- tolower(substr(df1$site, 2, 3))
df1
# site
#1 Jpn Tokyo
#2 Aus Sydney
#3 Chn Beijing
res <- rbind(df1, df)
res[order(res$site), , drop = FALSE]
# site
#2 Aus Sydney
#5 AUS Sydney
#3 Chn Beijing
#6 CHN Beijing
#1 Jpn Tokyo
#4 JPN Tokyo

How to re-order the columns after splitting it?

I have a data frame that contains the list of countries and it has been split using the csplit function.
The code is as follows:-
df <- data.frame(country = c("India, South Africa", "United Kingdom, United States, India",
"England, Australia, South Africa, Germany, United States"))
splitstackshape::cSplit(df, "country", sep = ", ")
# country_1 country_2 country_3 country_4 country_5
#1: India South Africa <NA> <NA> <NA>
#2: United Kingdom United States India <NA> <NA>
#3: England Australia South Africa Germany United States
I wish to rearrange the columns in such a manner that the country_1 column contains either United States or <NA>. Similarly, country_2 and country_3 should contain India or <NA> and United Kingdom or <NA>, respectively. From country_4 onwards, the remaining countries can follow the order in which they appear in the row.
Expected output is as follows,
#Expected Output
# country_1 country_2 country_3 country_4 country_5 country_6 country_7
#1 <NA> India <NA> South Africa <NA> <NA> <NA>
#2 United States India United Kingdom <NA> <NA> <NA> <NA>
#3 United States <NA> <NA> England Australia South Africa Germany
A very ugly solution using apply :
df1 <- splitstackshape::cSplit(df, "country", sep = ", ")
# Countries that own a fixed output slot, in slot order (columns 1, 2, 3).
priority <- c("United States", "India", "United Kingdom")
# Output width: one slot per priority country plus one per distinct other country.
# This is always wide enough for any row, even when a priority country never
# occurs in the data, so every row has the same length and apply() returns a matrix.
others <- setdiff(unique(na.omit(unlist(df1))), priority)
n <- length(priority) + length(others)
as.data.frame(t(apply(df1, 1, function(x) {
  x1 <- rep(NA_character_, n)
  # Fill a priority slot only when that country occurs in this row.
  hit <- priority %in% x
  x1[hit] <- priority[hit]
  # Remaining countries keep their row order; drop NA explicitly so it can
  # never be treated as a country.
  temp <- setdiff(x[!is.na(x)], priority)
  if (length(temp) > 0) x1[length(priority) + seq_along(temp)] <- temp
  x1
})))
# V1 V2 V3 V4 V5 V6 V7
#1 <NA> India <NA> South Africa <NA> <NA> <NA>
#2 United States India United Kingdom <NA> <NA> <NA> <NA>
#3 United States <NA> <NA> England Australia South Africa Germany

Add new column if range of columns contains string in R

I have a dataframe like below. I would like to add 2 columns:
ContainsANZ: Indicates if any of the columns from F0 to F3 contain 'Australia' or 'New Zealand' ignoring NA values
AllANZ: Indicates if all non NA columns contain 'Australia' or 'New Zealand'
Starting dataframe would be:
dfContainsANZ
Col.A Col.B Col.C F0 F1 F2 F3
1 data 0 xxx Australia Singapore <NA> <NA>
2 data 1 yyy United States United States United States <NA>
3 data 0 zzz Australia Australia Australia Australia
4 data 0 ooo Hong Kong London Australia <NA>
5 data 1 xxx New Zealand <NA> <NA> <NA>
The end result should look like this:
df
Col.A Col.B Col.C F0 F1 F2 F3 ContainsANZ AllANZ
1 data 0 xxx Australia Singapore <NA> <NA> Australia undefined
2 data 1 yyy United States United States United States <NA> undefined undefined
3 data 0 zzz Australia Australia Australia Australia Australia Australia
4 data 0 ooo Hong Kong London Australia <NA> Australia undefined
5 data 1 xxx New Zealand <NA> <NA> <NA> New Zealand New Zealand
I'm using dplyr (preferred solution) and have come up with a code like this which doesn't work and is very repetitive. Is there a better way to write this so that I am not having to copy F0|F1|F2... rules over again? My real data set has more. Is NAs interfering with the code?
df <- df %>%
mutate(ANZFlag =
ifelse(
F0 == 'Australia' |
F1 == 'Australia' |
F2 == 'Australia' |
F3 == 'Australia',
'Australia',
ifelse(
F0 == 'New Zealand' |
F1 == 'New Zealand' |
F2 == 'New Zealand' |
F3 == 'New Zealand',
'New Zealand', 'undefined'
)
)
)
Still some typing, but I think this gets at the essence you're looking for:
library(dplyr)
df <- read.table(text='Col.A,Col.B,Col.C,F0,F1,F2,F3
data,0,xxx,Australia,Singapore,NA,NA
data,1,yyy,"United States","United States","United States",NA
data,0,zzz,Australia,Australia,Australia,Australia
data,0,ooo,"Hong Kong",London,Australia,NA
data,1,xxx,"New Zealand",NA,NA,NA', header=TRUE, sep=",", stringsAsFactors=FALSE)
# Flag whether the F0..F3 columns of a row mention Australia or New Zealand.
#
# x: a data frame with columns F0, F1, F2 and F3; expected to hold a single
#    row (the name lookup on unlist(x) only finds "F0".."F3" for one row).
# Returns x with two logical columns appended:
#   ContainsANZ - TRUE if at least one of F0..F3 is "Australia" or "New Zealand"
#   AllANZ      - TRUE if every non-NA value of F0..F3 is one of those two
#                 (vacuously TRUE when all four are NA)
down_under <- function(x) {
  mtch <- c("Australia", "New Zealand")
  cols <- unlist(x)[c("F0", "F1", "F2", "F3")]
  present <- cols[!is.na(cols)]
  # Compare against `mtch`, not `cols`: a vector is always a subset of itself,
  # so testing `cols %in% cols` would report TRUE for every row.
  bind_cols(x, data.frame(ContainsANZ = any(mtch %in% cols, na.rm = TRUE),
                          AllANZ = all(present %in% mtch)))
}
rowwise(df) %>% do(down_under(.))
## Source: local data frame [5 x 9]
## Groups: <by row>
##
## Col.A Col.B Col.C F0 F1 F2 F3 ContainsANZ AllANZ
## (chr) (int) (chr) (chr) (chr) (chr) (chr) (lgl) (lgl)
## 1 data 0 xxx Australia Singapore NA NA TRUE TRUE
## 2 data 1 yyy United States United States United States NA FALSE TRUE
## 3 data 0 zzz Australia Australia Australia Australia TRUE TRUE
## 4 data 0 ooo Hong Kong London Australia NA TRUE TRUE
## 5 data 1 xxx New Zealand NA NA NA TRUE TRUE

Extracting parts of data.frame

I have an issue while extracting and creating a new data.frame on the basis of previous one.
So we have:
> head(data.raw)
date id contacted contacted_again region
1 2015-11-29 234 CHAT EMAIL APAC
2 2015-11-29 234 EMAIL EMAIL APAC
3 2015-11-27 257 PHONE PHONE EMEA
4 2015-11-27 278 PHONE EMAIL APAC
5 2015-11-27 293 CHAT EMAIL EMEA
6 2015-11-27 243 EMAIL EMAIL EMEA
market
1 AU/NZ
2 SE Asia (English)
3 Spain
4 China Mainland
5 DACH
6 DACH
However, once I write
data.ru <- data.raw[data.raw$market=="Russia",]
I receive the following mess:
date id contacted contacted_again region market
67 2015-11-25 334 CHAT EMAIL EMEA Russia
NA <NA> <NA> <NA> <NA> <NA> <NA>
NA.1 <NA> <NA> <NA> <NA> <NA> <NA>
NA.2 <NA> <NA> <NA> <NA> <NA> <NA>
NA.3 <NA> <NA> <NA> <NA> <NA> <NA>
NA.4 <NA> <NA> <NA> <NA> <NA> <NA>
How should I write a command to receive just a normal data.frame with all rows that $market=="Russia" without any NAs?
I would just use the subset function.
test <- data.frame(x = c("USA", "USA", "USA", "Russia", "Russia", NA), y = c("Orlando", "Boston", "Memphis", NA, "St. Petersburg", "Mexico City"))
print(test)
x y
1 USA Orlando
2 USA Boston
3 USA Memphis
4 Russia <NA>
5 Russia St. Petersburg
6 <NA> Mexico City
subset(test, x == "Russia")
x y
4 Russia <NA>
5 Russia St. Petersburg
You may want to try: data.ru <- data.raw[data.raw$market %in% "Russia",]
Explanation: I am assuming you have empty lines in your dataset, which are read as NAs (missing value). Since R cannot know if a given NA is equal to "Russia" or not, the generated data frame includes them.
Illustration in code:
# create sample dataset
example.df <- data.frame(market=c(NA, "Russia", NA), outcome = c(1,2,3))
# match market using ==
example.df$market == "Russia"
example.df[example.df$market == "Russia",]
# match market using %in%
example.df$market %in% "Russia"
example.df[example.df$market %in% "Russia",]

Resources