Swap Misplaced cells in R? - r

I have a huge database (more than 65M rows) and I noticed that some cells are misplaced. As an example, let's say I have this:
library("tidyverse")
# Example data: in some rows the STATE and COUNTRY values are swapped
# (rows 3, 6, 7 and 11 have "USA" in STATE).
DATA <- tibble(
  SURNAME = c('Smith', 'Johnson', 'Williams', 'Jones', 'Williams', 'Brown',
              'Daves', 'Miller', 'Williams', 'Daves', 'Jones'),
  NAME = c('Emma', 'Oliia', 'James', 'Noah', 'Liam', 'Sophia',
           'Evelyn', 'Jacob', 'Lucas', 'John', 'Carl'),
  STATE = c('California', 'Texas', 'USA', 'Pennsylvania', 'Illinois', 'USA',
            'USA', 'New Mexico', 'Connecticut', 'California', 'USA'),
  COUNTRY = c('USA', 'USA', 'California', 'USA', 'USA', 'Louisiana',
              'Oregon', 'USA', 'USA', 'USA', 'Illinois')
)
=====
> DATA
# A tibble: 11 x 4
SURNAME NAME STATE COUNTRY
<chr> <chr> <chr> <chr>
1 Smith Emma California USA
2 Johnson Oliia Texas USA
3 Williams James USA California
4 Jones Noah Pennsylvania USA
5 Williams Liam Illinois USA
6 Brown Sophia USA Louisiana
7 Daves Evelyn USA Oregon
8 Miller Jacob New Mexico USA
9 Williams Lucas Connecticut USA
10 Daves John California USA
11 Jones Carl USA Illinois
As you can see, the Country and State values are swapped in some rows. How can I efficiently swap them back?
Kind Regards,
Luiz.

Using data.table and the in-built state.name vector:
library(data.table)

# Convert DATA to a data.table by reference (no copy of the large table).
setDT(DATA)
# For rows whose COUNTRY value is actually a US state name, swap the two
# columns in place. Both right-hand sides are taken from the pre-update
# columns, so no temporary column is needed.
DATA[COUNTRY %in% state.name, `:=`(COUNTRY = STATE, STATE = COUNTRY)]
DATA
# SURNAME NAME STATE COUNTRY
# 1: Smith Emma California USA
# 2: Johnson Oliia Texas USA
# 3: Williams James California USA
# 4: Jones Noah Pennsylvania USA
# 5: Williams Liam Illinois USA
# 6: Brown Sophia Louisiana USA
# 7: Daves Evelyn Oregon USA
# 8: Miller Jacob New Mexico USA
# 9: Williams Lucas Connecticut USA
# 10: Daves John California USA
# 11: Jones Carl Illinois USA

Check this solution (it assumes that COUNTRY column is in ISO3 format e.g. MEX, CAN):
DATA %>%
  mutate(
    # TRUE when COUNTRY already holds a three-letter upper-case (ISO3-style) code.
    # The pattern is anchored so that longer values which merely contain three
    # consecutive capitals are not mistaken for a country code.
    COUNTRY_OK = str_detect(COUNTRY, '^[A-Z]{3}$'),
    COUNTRY_TMP = if_else(COUNTRY_OK, COUNTRY, STATE),
    STATE = if_else(COUNTRY_OK, STATE, COUNTRY),
    COUNTRY = COUNTRY_TMP
  ) %>%
  # Drop the helper columns so only the original four columns remain.
  select(-COUNTRY_TMP, -COUNTRY_OK)

Assuming all country names follow the ISO3 format, we can first install the countrycode package. This package contains a data frame called codelist with a column iso3c that holds the ISO3 country codes. We can use it as follows to swap the misplaced values.
library(tidyverse)
library(countrycode)
DATA2 <- DATA %>%
  mutate(
    # A row is considered swapped only when STATE holds an ISO3 code and
    # COUNTRY does not. Every other row is left exactly as it is, so a row
    # where neither column is an ISO3 code no longer loses its COUNTRY value.
    SWAPPED = STATE %in% codelist$iso3c & !COUNTRY %in% codelist$iso3c,
    STATE2 = ifelse(SWAPPED, COUNTRY, STATE),
    COUNTRY2 = ifelse(SWAPPED, STATE, COUNTRY)
  ) %>%
  select(-STATE, -COUNTRY, -SWAPPED) %>%
  rename(STATE = STATE2, COUNTRY = COUNTRY2)
DATA2
# # A tibble: 11 x 4
# SURNAME NAME STATE COUNTRY
# <chr> <chr> <chr> <chr>
# 1 Smith Emma California USA
# 2 Johnson Oliia Texas USA
# 3 Williams James California USA
# 4 Jones Noah Pennsylvania USA
# 5 Williams Liam Illinois USA
# 6 Brown Sophia Louisiana USA
# 7 Daves Evelyn Oregon USA
# 8 Miller Jacob New Mexico USA
# 9 Williams Lucas Connecticut USA
# 10 Daves John California USA
# 11 Jones Carl Illinois USA

Related

Is it possible to convert lines from a text file into columns to get a dataframe?

I have a text file containing information on book title, author name, and country of birth, which appear on separate lines as shown below:
Oscar Wilde
De Profundis
Ireland
Nathaniel Hawthorn
Birthmark
USA
James Joyce
Ulysses
Ireland
Walt Whitman
Leaves of Grass
USA
Is there any way to convert the text to a dataframe with these three items appearing as different columns:
ID Author Book Country
1 "Oscar Wilde" "De Profundis" "Ireland"
2 "Nathaniel Hawthorn" "Birthmark" "USA"
There are built-in functions for dealing with this kind of data:
# Read the text three lines at a time: with sep = "\n" every line is one field,
# and the `what` list supplies the three field names that become the columns.
record_fields <- list(Author = "", Book = "", Country = "")
data.frame(
  scan(text = xx, what = record_fields, sep = "\n", multi.line = TRUE)
)
# Author Book Country
#1 Oscar Wilde De Profundis Ireland
#2 Nathaniel Hawthorn Birthmark USA
#3 James Joyce Ulysses Ireland
#4 Walt Whitman Leaves of Grass USA
You can create a 3-column matrix from one column of data.
# Read one field per line. readLines() keeps commas, quotes, apostrophes and
# '#' characters inside a line intact, whereas read.table(sep = ',') would
# split on commas and treat the other characters specially.
raw_lines <- readLines('data.txt')
# Blank lines are dropped, as read.table() did by default.
dat <- data.frame(V1 = raw_lines[nzchar(trimws(raw_lines))])
# Fill a 3-column matrix row by row: each run of three lines is one record.
result <- matrix(dat$V1, ncol = 3, byrow = TRUE) |>
  data.frame() |>
  setNames(c('Author', 'Book', 'Country'))
# seq_len() stays correct (empty) when there are no records.
result <- cbind(ID = seq_len(nrow(result)), result)
result
# ID Author Book Country
#1 1 Oscar Wilde De Profundis Ireland
#2 2 Nathaniel Hawthorn Birthmark USA
#3 3 James Joyce Ulysses Ireland
#4 4 Walt Whitman Leaves of Grass USA
There aren't any built in functions that handle data like this. But you can reshape your data after importing.
# Test data: four records of three lines each (author, book, country),
# joined into one newline-separated string and written to test.txt.
xx <- paste(
  c(
    "Oscar Wilde", "De Profundis", "Ireland",
    "Nathaniel Hawthorn", "Birthmark", "USA",
    "James Joyce", "Ulysses", "Ireland",
    "Walt Whitman", "Leaves of Grass", "USA"
  ),
  collapse = "\n"
)
writeLines(xx, "test.txt")
And then the code
library(dplyr)
library(tidyr)
# Read one value per line; unlike read.csv(), readLines() does not split a
# line that happens to contain a comma into several columns.
lines <- data.frame(V1 = readLines("test.txt"))
lines %>%
  mutate(
    # pid: record number (every three consecutive lines form one record).
    pid = (row_number() - 1) %/% 3 + 1,
    # col: which field of the record this line is, by its position (1, 2 or 3).
    col = c("Author", "Book", "Country")[(row_number() - 1) %% 3 + 1]
  ) %>%
  # Spread the three lines of each record into one row with three columns.
  pivot_wider(names_from = col, values_from = V1)
Which returns
# A tibble: 4 x 4
pid Author Book Country
<dbl> <chr> <chr> <chr>
1 1 Oscar Wilde De Profundis Ireland
2 2 Nathaniel Hawthorn Birthmark USA
3 3 James Joyce Ulysses Ireland
4 4 Walt Whitman Leaves of Grass USA

Make duplicate rows but change specific characters to lowercase in duplicates - R

I have a dataframe that looks like this:
+------------+
|site |
+------------+
|JPN Tokyo |
|AUS Sydney |
|CHN Beijing |
But I'd like to make duplicate rows of the existing rows but with the 2nd and 3rd character changed to lowercase such that the dataframe becomes like this:
+------------+
|site |
+------------+
|JPN Tokyo |
|Jpn Tokyo |
|AUS Sydney |
|Aus Sydney |
|CHN Beijing |
|Chn Beijing |
Would anyone have an idea how to do that?
We expand the rows with uncount, then create a logical condition with duplicated on the 'site', replace the substring values to lower case using sub within case_when
library(dplyr)
library(tidyr)
library(stringr)
df1 <- df1 %>%
  # Repeat every row twice; the copies are adjacent.
  uncount(2) %>%
  mutate(
    # Every second row is the copy to modify. Using the row position instead of
    # duplicated(site) keeps this correct when the input already contains
    # repeated site values.
    site = if_else(
      row_number() %% 2 == 0,
      # Keep the 1st character and lower-case only the 2nd and 3rd (\\L is the
      # perl lower-case modifier), whatever the length of the first word.
      sub("^(.)(.{1,2})", "\\1\\L\\2", site, perl = TRUE),
      site
    )
  )
-output
df1
# A tibble: 6 x 1
site
<chr>
1 JPN Tokyo
2 Jpn Tokyo
3 AUS Sydney
4 Aus Sydney
5 CHN Beijing
6 Chn Beijing
data
# Example input: a one-column data frame of site labels (character, 3 rows).
df1 <- data.frame(
  site = c("JPN Tokyo", "AUS Sydney", "CHN Beijing"),
  stringsAsFactors = FALSE
)
Edit: @AnilGoyal suggested using map_dfr, which reduces the call to a single line.
library(tidyverse)
data <-
  tribble(
    ~site,
    'JPN Tokyo',
    'AUS Sydney',
    'CHN Beijing' )
# Lower-case only the 2nd and 3rd characters of each string. str_to_title()
# would also change the case of every later word in the site name.
lower_2_3 <- function(x) {
  str_sub(x, 2, 3) <- str_to_lower(str_sub(x, 2, 3))
  x
}
#option1
map_dfr(data$site, ~list(sites = c(.x, lower_2_3(.x))))
#> # A tibble: 6 x 1
#> sites
#> <chr>
#> 1 JPN Tokyo
#> 2 Jpn Tokyo
#> 3 AUS Sydney
#> 4 Aus Sydney
#> 5 CHN Beijing
#> 6 Chn Beijing
#option2
# Combine with c() rather than rbind() so the result is a plain character
# vector and the tibble column is `site`, not a one-column matrix `site[,1]`.
map(data$site, ~c(.x, lower_2_3(.x))) %>%
  reduce(c) %>%
  tibble(site = .)
#> # A tibble: 6 x 1
#> site
#> <chr>
#> 1 JPN Tokyo
#> 2 Jpn Tokyo
#> 3 AUS Sydney
#> 4 Aus Sydney
#> 5 CHN Beijing
#> 6 Chn Beijing
Created on 2021-06-08 by the reprex package (v2.0.0)
You can use substr to replace characters at specific position.
# Work on a copy so the original rows stay available for the final rbind().
df1 <- df
# Lower-case characters 2-3 of every site value, in place.
lowered_part <- tolower(substr(df1$site, 2, 3))
substr(df1$site, 2, 3) <- lowered_part
df1
# site
#1 Jpn Tokyo
#2 Aus Sydney
#3 Chn Beijing
# Stack the modified rows on top of the originals, then sort by site so each
# pair ends up adjacent; drop = FALSE keeps the one-column data frame.
res <- rbind(df1, df)
site_order <- order(res$site)
res[site_order, , drop = FALSE]
# site
#2 Aus Sydney
#5 AUS Sydney
#3 Chn Beijing
#6 CHN Beijing
#1 Jpn Tokyo
#4 JPN Tokyo

Selecting a column with a dot in R (nested object)

I'm new to R and I'm not sure how to rephrase the question, but basically, I have this dataset coming from the following code:
# Download the stats JSON and parse it into nested R lists / data frames.
data_url <- 'https://prod-scores-api.ausopen.com/year/2021/stats'
dat <- jsonlite::fromJSON(data_url)
# Players element of the first ranking (presumably the men's aces ranking,
# going by the variable name - confirm against the JSON).
men_aces <- bind_rows(dat$statistics$rankings[[1]]$players[1])
# Keep only the players that appear in that ranking, with their name and the
# nested `nationality` data frame column.
men_aces_table <- select(
  inner_join(dat$players, men_aces, by = c('uuid' = 'player_id')),
  full_name, nationality
)
Which resulted in this data frame:
full_name nationality.uuid nationality.name nationality.code
1 Novak Djokovic 99da9b29-eade-4ac3-a7b0-b0b8c2192df7 Serbia SRB
2 Alexander Zverev 99d83e85-3173-4ccc-9d91-8368720f4a47 Germany GER
3 Milos Raonic 07779acb-6740-4b26-a664-f01c0b54b390 Canada CAN
4 Daniil Medvedev fa925d2d-337f-4074-a0bd-afddb38d66e1 Russia RUS
5 Nick Kyrgios 9b11f78c-47c1-43c4-97d0-ba3381eb9f07 Australia AUS
nationality is the nested object inside the player object if you check the JSON url, it contains the above properties (uuid, name, code), if I select the full_name property I would get the value (which is of type character) right back.
I'm not sure how to select the name from that nested data frame (nationality) and rename it to country.
My expected outcome is:
full_name country
1 Novak Djokovic Serbia
2 Alexander Zverev Germany
3 Milos Raonic Canada
4 Daniil Medvedev Russia
5 Nick Kyrgios Australia
I would appreciate some help. Sorry I was unclear.
Use purrr::pmap_chr
library(tidyverse)
dat$players %>%
  inner_join(men_aces, by = c('uuid' = 'player_id')) %>%
  select(full_name, nationality) %>%
  # `nationality` is a nested data frame column; take its `name` column by
  # name rather than iterating row by row and picking the 2nd field by position.
  mutate(nationality = nationality$name)
full_name nationality
1 Novak Djokovic Serbia
2 Alexander Zverev Germany
3 Milos Raonic Canada
4 Daniil Medvedev Russia
5 Nick Kyrgios Australia
6 Alexander Bublik Kazakhstan
7 Reilly Opelka United States of America
8 Jiri Vesely Czech Republic
9 Andrey Rublev Russia
10 Lloyd Harris South Africa
11 Aslan Karatsev Russia
12 Taylor Fritz United States of America
13 Matteo Berrettini Italy
14 Grigor Dimitrov Bulgaria
15 Feliciano Lopez Spain
16 Stefanos Tsitsipas Greece
17 Felix Auger-Aliassime Canada
18 Thanasi Kokkinakis Australia
19 Ugo Humbert France
20 Borna Coric Croatia
You could do:
# Build the two-column result straight from the nested players data:
# the player's full name and the `name` field of the nested nationality.
player_names <- dat$players$full_name
player_countries <- dat$players$nationality$name
bind_cols(full_name = player_names, country = player_countries)
# A tibble: 169 x 2
full_name country
<chr> <chr>
1 Novak Djokovic Serbia
2 Alexander Zverev Germany
3 Milos Raonic Canada
4 Daniil Medvedev Russia
5 Nick Kyrgios Australia
6 Alexander Bublik Kazakhstan
7 Reilly Opelka United States of America
8 Jiri Vesely Czech Republic
9 Andrey Rublev Russia
10 Lloyd Harris South Africa
just add this line at the end
# Flatten the nested nationality column: keep the player name and the
# nationality's `name` field under the column name `country`.
newdf <- data.frame(
  full_name = men_aces_table[["full_name"]],
  country = men_aces_table[["nationality"]][["name"]]
)

html_table doubles value of columns

I'm trying to scrape wiki table with this code:
library(tidyverse)
library(rvest)
my_url <- "https://en.wikipedia.org/wiki/List_of_Australian_Open_men%27s_singles_champions"
# Collect every <table> node on the page, keep the fourth one ...
page_tables <- html_nodes(read_html(my_url), "table")
mytable <- page_tables[[4]]
# ... and parse that node into a data frame.
mytable <- html_table(mytable)
The problem is that, in the returned table, the values in both name columns (Champion and Runner-up) are doubled. Well, not exactly doubled: each cell contains the name twice, once as "Surname, Name" and once as "Name Surname". The original wiki page does not look like that; only "Name Surname" is visible there. Why does this happen and how can I get rid of it? I need those columns to contain "Name Surname" only.
head(mytable)
Year[f] Country Champion Country Runner-up Score in the final[4][14]
1 1969 AUS Laver, RodRod Laver[b] ESP Gimeno, AndrésAndrés Gimeno 6–3, 6–4, 7–5
2 1970 USA Ashe, ArthurArthur Ashe AUS Crealy, DickDick Crealy 6–4, 9–7, 6–2
3 1971 AUS Rosewall, KenKen Rosewall USA Ashe, ArthurArthur Ashe 6–1, 7–5, 6–3
4 1972 AUS Rosewall, KenKen Rosewall AUS Anderson, MalcolmMalcolm Anderson 7–6(7–2), 6–3, 7–5
5 1973 AUS Newcombe, JohnJohn Newcombe NZL Parun, OnnyOnny Parun 6–3, 6–7, 7–5, 6–1
6 1974 USA Connors, JimmyJimmy Connors AUS Dent, PhilPhil Dent 7–6(9–7), 6–4, 4–6, 6–3
htmltab can be used to scrape these Wiki tables.
library(htmltab)
#data cleaning steps
# bFun: cell cleaner passed to htmltab() through `bodyFun`. It takes the text
# of a node with XML::xmlValue(), re-encodes it from UTF-8 to Windows-1252
# (sub = "byte" substitutes bytes that cannot be converted), and removes
# everything from the first whitespace that is followed by '<', '†' or '‡'
# up to the end of the string.
bFun <- function(node) {
  x <- XML::xmlValue(node)
  gsub("\\s[<†‡].*$", "", iconv(x, from = 'UTF-8', to = "Windows-1252", sub="byte"))
}
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
df1 <- htmltab(doc = "https://en.wikipedia.org/wiki/List_of_Australian_Open_men%27s_singles_champions",
               which = 4,
               rm_superscript = FALSE,
               bodyFun = bFun) #this function is not required if you are executing the code from Mac
head(df1)
which gives
# Year[f] Country Champion Country Runner-up Score in the final[4][14]
#2 1969 AUS Rod Laver[b] ESP Andrés Gimeno 6–3, 6–4, 7–5
#3 1970 USA Arthur Ashe AUS Dick Crealy 6–4, 9–7, 6–2
#4 1971 AUS Ken Rosewall USA Arthur Ashe 6–1, 7–5, 6–3
#5 1972 AUS Ken Rosewall AUS Malcolm Anderson 7–6(7–2), 6–3, 7–5
#6 1973 AUS John Newcombe NZL Onny Parun 6–3, 6–7, 7–5, 6–1
#7 1974 USA Jimmy Connors AUS Phil Dent 7–6(9–7), 6–4, 4–6, 6–3
and
# Same extraction for the Wimbledon page: third table, superscripts kept,
# body cells passed through bFun. FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
df2 <- htmltab(doc = "https://en.wikipedia.org/wiki/List_of_Wimbledon_gentlemen%27s_singles_champions",
               which = 3,
               rm_superscript = FALSE,
               bodyFun = bFun) #this function is not required if you are executing the code from Mac
head(df2)
gives
# Year[d] Country Champion Country Runner-up Score in the final[4]
#2 1877 BRI[e] Spencer Gore BRI William Marshall 6–1, 6–2, 6–4
#3 1878 BRI Frank Hadow BRI Spencer Gore 7–5, 6–1, 9–7
#4 1879 BRI John Hartley BRI Vere St. Leger Goold 6–2, 6–4, 6–2
#5 1880 BRI John Hartley BRI Herbert Lawford 6–3, 6–2, 2–6, 6–3
#6 1881 BRI William Renshaw BRI John Hartley 6–0, 6–1, 6–1
#7 1882 BRI William Renshaw BRI Ernest Renshaw 6–1, 2–6, 4–6, 6–2, 6–2

R group or aggregate

I would like to do a group_by or aggregate. I have something like:
> head(affiliation_clean)
Affiliation_ID Affiliation_Name City Country
1 000001 New Mexico State University Las Cruces Las Cruces United States
2 000001 New Mexico State University Las Cruces Las Cruces <NA>
3 000001 New Mexico State University Las Cruces <NA> <NA>
4 000002 Palo Alto Research Center Incorporated Palo Alto <NA>
5 000002 Palo Alto Research Center Incorporated <NA> United States
6 000002 Palo Alto Research Center Incorporated <NA> <NA>
Grouping by "Affiliation_ID" and taking the longest string of "Affiliation_Name", "City" and "Country", I would like to get:
> head(affiliation_clean)
Affiliation_ID Affiliation_Name City Country
1 000001 New Mexico State University Las Cruces Las Cruces United States
2 000002 Palo Alto Research Center Incorporated Palo Alto United States
Thanks in advance.
Here is a dplyr solution based on your description to select the longest string of each Affiliation_ID and column.
library(dplyr)
dat2 <- dat %>%
  group_by(Affiliation_ID) %>%
  # For every non-grouping column keep the longest string within the group.
  # across() replaces the deprecated summarise_all(funs(...)) form.
  # which.max() skips NA lengths; the trailing [1] turns the empty result of
  # an all-NA group into a single NA so each group still yields one row.
  summarise(across(everything(), ~ .x[which.max(nchar(.x))][1]))
dat2
# # A tibble: 2 x 4
# Affiliation_ID Affiliation_Name City Country
# <int> <chr> <chr> <chr>
# 1 1 New Mexico State University Las Cruces Las Cruces United States
# 2 2 Palo Alto Research Center Incorporated Palo Alto United States
DATA
# Example input for the answer above: six rows, three per Affiliation_ID,
# with NA marking a missing City or Country value.
# NOTE(review): the quoted IDs such as '000001' appear to be read as integers
# (the printed result shows <int> values 1 and 2), so the leading zeros are
# lost - confirm whether colClasses should be set if the IDs must stay text.
dat <-read.table(text = " Affiliation_ID Affiliation_Name City Country
1 '000001' 'New Mexico State University Las Cruces' 'Las Cruces' 'United States'
2 '000001' 'New Mexico State University Las Cruces' 'Las Cruces' NA
3 '000001' 'New Mexico State University Las Cruces' NA NA
4 '000002' 'Palo Alto Research Center Incorporated' 'Palo Alto' NA
5 '000002' 'Palo Alto Research Center Incorporated' NA 'United States'
6 '000002' 'Palo Alto Research Center Incorporated' NA NA",
header = TRUE, stringsAsFactors = FALSE)
Assuming that there is a single unique 'City/Country' for each 'Affiliation_ID', 'Affiliation_Name', after grouping at the first two columns, get the unique non-NA element of all other columns with summarise_all
library(dplyr)
affiliation_clean %>%
  group_by(Affiliation_ID, Affiliation_Name) %>%
  # For every remaining column keep the distinct non-NA value of the group.
  # across() replaces the deprecated summarise_all(funs(...)) form.
  # This relies on each group having exactly one distinct non-NA value per column.
  summarise(across(everything(), ~ unique(.x[!is.na(.x)])))
# A tibble: 2 x 4
# Groups: Affiliation_ID [?]
# Affiliation_ID Affiliation_Name City Country
# <chr> <chr> <chr> <chr>
#1 000001 New Mexico State University Las Cruces Las Cruces United States
#2 000002 Palo Alto Research Center Incorporated Palo Alto United States

Resources