How to remove rows that have repeated elements? - r

I have a dataframe that looks like this (but for every US county)
county
state
neighbor_county
neighbor_state
Baldwin County
AL
Clarke County
NA
Baldwin County
AL
Escambia County
FL
Baldwin County
AL
Mobile County
NA
Baldwin County
AL
Monroe County
NA
Barbour County
AL
Dale County
NA
Barbour County
AL
Henry County
NA
I am only interested in what states neighbor a county, so I want to remove repeated data to get this (step 1):
county
state
neighbor_state
Baldwin County
AL
NA
Baldwin County
AL
FL
Barbour County
AL
NA
And then reshape the dataframe like this (step 2):
county
state
neighbor_state_1
neighbor_state_2
neighbor_state_3
Baldwin County
AL
FL
NA
NA
Baldwin County
AL
NA
NA
NA
In step 1 I've deleted the "neighbor_county" column; however, I've not managed to remove the duplicates in the column "neighbor_state" for each distinct county. I have tried using the unique function but I can't seem to make it work such that it only removes duplicates of each distinct county.

For your first step you could drop the neighbor_county column and then use unique():
df$neighbor_county <- NULL
unique(df)
returns
county state neighbor_state
1 Baldwin_County AL NA
2 Baldwin_County AL FL
5 Barbour_County AL NA
An alternative using dplyr:
df %>%
select(-neighbor_county) %>%
distinct()
For your second step I make a suggestion:
library(tidyr)
library(dplyr)
df %>%
group_by(county) %>%
select(-neighbor_county) %>%
mutate(n = row_number()) %>%
pivot_wider(names_from=n, names_prefix="neighbor_state_", values_from=neighbor_state) %>%
ungroup()
returns
# A tibble: 2 x 6
county state neighbor_state_1 neighbor_state_2 neighbor_state_3 neighbor_state_4
<chr> <chr> <chr> <chr> <chr> <chr>
1 Baldwin_County AL 'NA' 'FL' 'NA' 'NA'
2 Barbour_County AL 'NA' 'NA' NA NA
but I'm not sure if this is what you are looking for.
For removing doubled NA-values, you could use
df %>%
group_by(county) %>%
select(-neighbor_county) %>%
distinct() %>%
mutate(n = row_number()) %>%
pivot_wider(names_from=n, names_prefix="neighbor_state_", values_from=neighbor_state) %>%
ungroup()
Data
structure(list(county = c("Baldwin_County", "Baldwin_County",
"Baldwin_County", "Baldwin_County", "Barbour_County", "Barbour_County"
), state = c("AL", "AL", "AL", "AL", "AL", "AL"), neighbor_county = c("Clarke_County",
"Escambia_County", "Mobile_County", "Monroe_County", "Dale_County",
"Henry_County"), neighbor_state = c("'NA'", "'FL'", "'NA'", "'NA'",
"'NA'", "'NA'")), problems = structure(list(row = 6L, col = "neighbor_state",
expected = "", actual = "embedded null", file = "literal data"), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame")), class = "data.frame", row.names = c(NA,
-6L), spec = structure(list(cols = list(county = structure(list(), class = c("collector_character",
"collector")), state = structure(list(), class = c("collector_character",
"collector")), neighbor_county = structure(list(), class = c("collector_character",
"collector")), neighbor_state = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))

Related

How can I add columns to a data frame with a value determined by values in other columns?

I have a dataset on population in counties in the US. I want to add a column for the state the county is in and one for the state code. Both are already available in the dataset but "hidden" inside other columns.
For instance, from the output we can see that the first observation says NAME = "Ada County, Idaho" and GEOID = "16001". I want one column with State = "Idaho" and one column with StateID = "16".
Thank you!
structure(list(NAME = c("Ada County, Idaho", "Ada County, Idaho",
"Ada County, Idaho", "Ada County, Idaho", "Ada County, Idaho",
"Ada County, Idaho"), GEOID = c("16001", "16001", "16001", "16001",
"16001", "16001"), year = c("2007", "2007", "2007", "2007", "2007",
"2007"), POP25 = c(205888, 205888, 205888, 205888, 205888, 205888
), EMPLOY25 = c(205888, 208506, 212770, 212272, 216058, 220856
)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L), groups = structure(list(NAME = "Ada County, Idaho", GEOID = "16001",
.rows = structure(list(1:6), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -1L), .drop = TRUE))
Perhaps this helps: to create 'State', remove the part of 'NAME' up to and including the comma and the one or more spaces that follow it (\\s+); to create 'StateID', take the first two characters of the 'GEOID' column using substr.
library(dplyr)
library(stringr)
df1 %>%
ungroup %>%
mutate(State = str_remove(NAME, ".*,\\s+"),
StateID = substr(GEOID, 1, 2))
Here is an alternative using str_extract and str_sub:
library(dplyr)
library(stringr)
# Build one regex that matches any US state name. It is anchored at the end of
# the string ($) because NAME has the form "<county>, <state>" and many
# counties are themselves named after states (e.g. "Washington County, Idaho");
# an unanchored pattern would return the first state name it meets.
pattern <- paste0("(", paste(state.name, collapse = "|"), ")$")
df %>%
  mutate(
    # State: the state name at the end of NAME (NA if none matches).
    State = str_extract(NAME, pattern),
    # StateID: the first two characters of GEOID.
    StateID = str_sub(GEOID, 1, 2)
  )
NAME GEOID year POP25 EMPLOY25 State StateID
<chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
1 Ada County, ~ 16001 2007 205888 205888 Idaho 16
2 Ada County, ~ 16001 2007 205888 208506 Idaho 16
3 Ada County, ~ 16001 2007 205888 212770 Idaho 16
4 Ada County, ~ 16001 2007 205888 212272 Idaho 16
5 Ada County, ~ 16001 2007 205888 216058 Idaho 16
6 Ada County, ~ 16001 2007 205888 220856 Idaho 16

Create new variable based on a condition across multiple columns

I have a binary variable ("Penalty") and 30 factors with the same levels: "Discharge", "Suspended", "Fine", "Community order", and "Imprisonment".
A small example:
ID
Possession
Importation
Production
Penalty
1
Fine
NA
Fine
Yes
2
NA
NA
Community order
No
3
Discharge
Discharge
NA
No
4
NA
NA
Suspended
Yes
5
Imprisonment
NA
NA
No
6
Fine
NA
Imprisonment
No
I would like to create a new factor based on the same condition across these columns plus the binary variable and where there are differing levels in the same row would like the new variable 'sentence' to retain the levels with this priority: Imprisonment > Community order, Suspended > Fine > Discharge. e.g. Discharge will only be present in the new column where no other level appears.
Desired output:
ID
Possession
Importation
Production
Penalty
Sentence
1
Fine
NA
Fine
Yes
Fine
2
NA
NA
Community order
No
Community order
3
Discharge
Discharge
NA
No
Discharge
4
NA
NA
Suspended
Yes
Suspended
5
Imprisonment
NA
NA
No
Imprisonment
6
Fine
NA
Imprisonment
No
Imprisonment
This is what I have attempted: (where "vec" is a vector of the factor column indices)
data <- data %>%
mutate(
crim_sanct = case_when(
(if_any(vec) == "Discharge") ~ "Discharge",
(if_any(vec) == "Fine") | (data$Penalty == "Yes") ~ "Fine",
(if_any(vec) == "Suspended") ~ "Suspended",
(if_any(vec) == "Community order") ~ "Community order",
(if_any(vec) == "Imprisonment") ~ "imprisonment"))
You are on the right track but have some small syntax issues in if_any.
Also in case_when you need to put the conditions based on the priority. So if Imprisonment > Community order then Imprisonment condition should come first before Community order.
library(dplyr)
# case_when() evaluates its conditions top to bottom and uses the first one
# that is TRUE, so the branches are listed from highest to lowest priority:
# Imprisonment > Community order, Suspended > Fine > Discharge.
# Rows where no condition is TRUE get NA.
data <- data %>%
  mutate(
    crim_sanct = case_when(
      if_any(Possession:Production, ~ . == "Imprisonment") ~ "Imprisonment",
      if_any(Possession:Production, ~ . == "Community order") ~
        "Community order",
      if_any(Possession:Production, ~ . == "Suspended") ~ "Suspended",
      # As in the original attempt, Penalty == "Yes" also maps to "Fine"
      # when no higher-priority sentence is present in the row.
      if_any(Possession:Production, ~ . == "Fine") | (Penalty == "Yes") ~ "Fine",
      # Discharge is last: it is only used when no other level appears.
      if_any(Possession:Production, ~ . == "Discharge") ~ "Discharge"
    )
  )
data
# ID Possession Importation Production Penalty crim_sanct
#1 1 Fine <NA> Fine Yes Fine
#2 2 <NA> <NA> Community order No Community order
#3 3 Discharge Discharge <NA> No Discharge
#4 4 <NA> <NA> Suspended Yes Suspended
#5 5 Imprisonment <NA> <NA> No Imprisonment
#6 6 Fine <NA> Imprisonment No Imprisonment
Since I don't know how to handle the Penalty column, we ignore it for now. Creating a column Sentence based on the columns Possession, Importation and Production could be done with
library(dplyr)
data %>%
mutate(across(
Possession:Production,
~ factor(.x,
c("Imprisonment", "Community order", "Suspended", "Fine", "Discharge"),
ordered = TRUE))) %>%
rowwise() %>%
mutate(Sentence = min(c_across(Possession:Production), na.rm = TRUE)) %>%
ungroup()
which returns
# A tibble: 6 x 6
ID Possession Importation Production Penalty Sentence
<dbl> <ord> <ord> <ord> <chr> <ord>
1 1 Fine NA Fine Yes Fine
2 2 NA NA Community order No Community order
3 3 Discharge Discharge NA No Discharge
4 4 NA NA Suspended Yes Suspended
5 5 Imprisonment NA NA No Imprisonment
6 6 Fine NA Imprisonment No Imprisonment
The main idea here is creating ordered factors and using a rowwise min-function to get the sentence with the highest priority.
Data
data <- structure(list(ID = c(1, 2, 3, 4, 5, 6), Possession = c("Fine",
NA, "Discharge", NA, "Imprisonment", "Fine"), Importation = c(NA,
NA, "Discharge", NA, NA, NA), Production = c("Fine", "Community order",
NA, "Suspended", NA, "Imprisonment"), Penalty = c("Yes", "No",
"No", "Yes", "No", "No")), problems = structure(list(row = 6L,
col = "Penalty", expected = "", actual = "embedded null",
file = "literal data"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")), class = "data.frame", row.names = c(NA,
-6L), spec = structure(list(cols = list(ID = structure(list(), class = c("collector_double",
"collector")), Possession = structure(list(), class = c("collector_character",
"collector")), Importation = structure(list(), class = c("collector_character",
"collector")), Production = structure(list(), class = c("collector_character",
"collector")), Penalty = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))

Moving elements from column to column in r

I have a dataframe that looks like this (but for every US county)
county
state
n_state_1
n_state_2
n_state_3
n_state_4
Autauga County
AL
NA
FL
NA
NA
Baldwin County
AL
GA
NA
TN
NA
Catron County
AL
FL
GA
NA
CA
I want to move the non-missing values (FL,GA,TN etc.) to the first columns starting from n_state_1 and then delete the columns containing only missing values to get:
county
state
n_state_1
n_state_2
n_state_3
Autauga County
AL
FL
NA
NA
Baldwin County
AL
GA
TN
NA
Catron County
AL
FL
GA
CA
I am struggling with the first step. I thought about using the function distinct but it doesn't work because there are non-empty elements in each column.
You could use dplyr and tidyr:
library(dplyr)
library(tidyr)
# Move the non-missing n_state_* values to the left-most columns and drop
# the columns that end up containing only NA.
df %>%
  # Long format: one row per (county, state, original column); the original
  # column name goes to `name`, the neighbouring state to `value`.
  pivot_longer(starts_with("n_state")) %>%
  # Remove the rows that contain NA.
  drop_na() %>%
  group_by(county, state) %>%
  # Renumber the remaining values 1, 2, ... within each county/state.
  mutate(name = row_number()) %>%
  # Back to wide format; the new numbers become n_state_1, n_state_2, ...
  pivot_wider(
    names_from = name,
    values_from = value,
    names_prefix = "n_state_"
  ) %>%
  # Drop the grouping so later steps do not silently operate per group.
  ungroup()
which returns
county state n_state_1 n_state_2 n_state_3
<chr> <chr> <chr> <chr> <chr>
1 Autauga_County AL FL NA NA
2 Baldwin_County AL GA TN NA
3 Catron_County AL FL GA CA
What happened here?
pivot_longer takes the n_state_{n}-columns and collapses them into two columns: the name-column contains the original column name (n_state_1, n_state_2 etc), the value-column contains the states (FL, GA or <NA> in many cases).
Next we remove every <NA> entry. (Note: I use <NA> to make clear it's an NA-value.)
After a grouping by county and state we add a rownumber. These numbers will be later used to create the new column names.
pivot_wider now takes these row numbers and prefixes them with n_state_ to get the new columns. The values are taken from the value-column created in the second line of code. pivot_wider fills the missing values with <NA>-values (default behaviour).
Data
structure(list(county = c("Autauga_County", "Baldwin_County",
"Catron_County"), state = c("AL", "AL", "AL"), n_state_1 = c(NA,
"GA", "FL"), n_state_2 = c("FL", NA, "GA"), n_state_3 = c(NA,
"TN", NA), n_state_4 = c(NA, NA, "CA")), problems = structure(list(
row = 3L, col = "n_state_4", expected = "", actual = "embedded null",
file = "literal data"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")), class = c("spec_tbl_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -3L), spec = structure(list(
cols = list(county = structure(list(), class = c("collector_character",
"collector")), state = structure(list(), class = c("collector_character",
"collector")), n_state_1 = structure(list(), class = c("collector_character",
"collector")), n_state_2 = structure(list(), class = c("collector_character",
"collector")), n_state_3 = structure(list(), class = c("collector_character",
"collector")), n_state_4 = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
Or another option with dapply from collapse and select only columns with any non-NA elements
library(collapse)
library(dplyr)
dapply(df1, MARGIN = 1, FUN = function(x) c(x[!is.na(x)], x[is.na(x)])) %>%
select(where(~ any(complete.cases(.))))
# A tibble: 3 x 5
county state n_state_1 n_state_2 n_state_3
<chr> <chr> <chr> <chr> <chr>
1 Autauga_County AL FL <NA> <NA>
2 Baldwin_County AL GA TN <NA>
3 Catron_County AL FL GA CA

Gather data in r with multiple columns

I have some data that I am trying to reshape with tidyr's pivot_longer function in R to get the output shown below, but I have not been able to do it.
I have data in this format. ( with many other column names )
Country State Year 1 Population 1 Year 2 Population2
U.S.A IL 2009 20000 2010 30000
U.S.A VA 2009 30000 2010 40000
I want to get data in this format.
Country State Year Population
U.S.A IL 2009 20000
U.S.A IL 2010 30000
U.S.A VA 2009 30000
U.S.A VA 2010 40000
I am able to do it for only one column, but I am not able to pass the other columns, like population.
My code is below.
file1<-file %>%
pivot_longer(
cols = contains("Year"),
names_sep = "_",
names_to = c(".value", "repeat"),
)
I was able to make it work on Tidyverse.
library(tidyverse)
file<-read_excel("peps300.xlsx")
names(file)<-str_replace_all(names(file), c("Year " = "Year_" , "Num " = "Num_", "DRate " = "DRate_" , "PRate " = "PRate_", "Denom " = "Denom_"))
file<-file %>%
pivot_longer(
cols = c(contains("Year"),contains("Num"),contains("DRate"),contains("PRate"),contains("Denom")),
names_sep = "_",
names_to = c(".value", "repeat")
)
An option would be to specify the cols that starts_with "Population" or "Year"
library(dplyr)
library(tidyr)  # pivot_longer() is exported by tidyr, not dplyr
# Reshape the paired Year_<n> / Population_<n> columns to long format.
# names_pattern splits each column name at the underscore: the first part
# (".value") becomes the output column, the second part becomes `group`.
df1 %>%
  pivot_longer(
    cols = c(starts_with("Population"), starts_with("Year")),
    names_to = c(".value", "group"),
    names_pattern = "(.*)_(.*)"
  )
# A tibble: 4 x 5
# Country State group Population Year
# <chr> <chr> <chr> <int> <int>
#1 U.S.A IL 1 20000 2009
#2 U.S.A IL 2 30000 2010
#3 U.S.A VA 1 30000 2009
#4 U.S.A VA 2 40000 2010
data
df1 <- structure(list(Country = c("U.S.A", "U.S.A"), State = c("IL",
"VA"), Year_1 = c(2009L, 2009L), Population_1 = c(20000L, 30000L
), Year_2 = c(2010L, 2010L), Population_2 = c(30000L, 40000L)),
class = "data.frame", row.names = c(NA,
-2L))
df %>%
pivot_longer(
-c(Country,State),
names_to = c(".value","group"),
names_pattern = "(.+)_(.+)"
)
# A tibble: 4 x 5
Country State group Year Population
<chr> <chr> <chr> <chr> <chr>
1 U.S.A IL 1 2009 20000
2 U.S.A IL 2 2010 30000
3 U.S.A VA 1 2009 30000
4 U.S.A VA 2 2010 40000
You can then drop the group if you don't need it.
And, to do this, you will need to clean your column names first. Make sure they all follow the same pattern and words are connected with a single space or a single underscore.
df <- structure(list(Country = c("U.S.A", "U.S.A"), State = c("IL",
"VA"), Year_1 = c("2009", "2009"), Population_1 = c("20000",
"30000"), Year_2 = c("2010", "2010"), Population_2 = c("30000",
"40000")), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L))

Spreading a Column to Find Largest/Smallest Observations

Currently struggling with a problem finding the country where billionaires are the oldest and youngest on average in a given country (data set shown below).
Moreover, I need to remove countries that have less than five observations.
I have code below that shows a table of age, worth in billions, and country code. I have countries sorted, but I am unsure of the best strategy to find which country has the youngest/oldest billionaires.
I have one line struck out where I am trying to spread by country name, but I think that will probably be messy.
Small sample here:
# A tibble: 2,614 x 22
age category citizenship company.name company.type `country code` founded
<int> <chr> <chr> <chr> <chr> <chr> <int>
1 NA Financi… Saudi Arab… Rolaco Trad… new SAU 1968
2 34 Financi… United Sta… Fidelity In… new USA 1946
3 59 Non-Tra… Brazil Companhia B… new BRA 1948
4 61 New Sec… Germany Ratiopharm new DEU 1881
5 NA Financi… Hong Kong Swire new HKG 1816
6 NA Traded … Bahrain YBA Kanoo new BHR 1890
7 NA New Sec… Japan Otsuka Hold… new JPN 1921
8 NA Traded … Japan Sony new JPN 1946
9 66 Financi… Japan Mori Buildi… new JPN 1959
10 NA Traded … France Chanel new FRA 1909
# … with 2,604 more rows, and 15 more variables: `from emerging` <chr>,
# gdp <dbl>, gender <chr>, industry <chr>, inherited <chr>, name <chr>,
# rank <int>, region <chr>, relationship <chr>, sector <chr>, `was
# founder` <chr>, `was political` <chr>, wealth.type <chr>, `worth in
# billions` <dbl>, year <int>
dput(head(bil))
structure(list(age = c(NA, 34L, 59L, 61L, NA, NA), category = c("Financial",
"Financial", "Non-Traded Sectors", "New Sectors", "Financial",
"Traded Sectors"), citizenship = c("Saudi Arabia", "United States",
"Brazil", "Germany", "Hong Kong", "Bahrain"), company.name = c("Rolaco Trading and Contracting Company",
"Fidelity Investments", "Companhia Brasileira de Distribui?ao",
"Ratiopharm", "Swire", "YBA Kanoo"), company.type = c("new",
"new", "new", "new", "new", "new"), country_code = c("SAU", "USA",
"BRA", "DEU", "HKG", "BHR"), founded = c(1968L, 1946L, 1948L,
1881L, 1816L, 1890L), `from emerging` = c("True", "True", "True",
"True", "True", "True"), gdp = c(1.58e+11, 8.1e+12, 8.54e+11,
2.5e+12, 1.6e+11, 6.1e+09), gender = c("male", "female", "male",
"male", "male", "male"), industry = c("Money Management", "Money Management",
"Retail, Restaurant", "Technology-Medical", "Money Management",
"Consumer"), inherited = c("True", "True", "True", "True", "True",
"True"), name = c("Abdul Aziz Al-Sulaiman", "Abigail Johnson",
"Abilio dos Santos Diniz", "Adolf Merckle", "Adrian and John Swire",
"Ahmed Ali Kanoo"), rank = c(404L, 145L, 322L, 388L, 162L, 383L
), region = c("Middle East/North Africa", "North America", "Latin America",
"Europe", "East Asia", "Middle East/North Africa"), relationship = c("founder",
"relation", "relation", "relation", "relation", "relation"),
sector = c("construction", "investment banking", "retail",
"pharmaceuticals", "trading company", "shipping"), `was founder` = c("True",
"True", "True", "True", "True", "True"), `was political` = c("False",
"False", "False", "False", "False", "True"), wealth.type = c("self-made finance",
"inherited", "inherited", "inherited", "inherited", "inherited"
), worth_billions = c(1, 2.5, 1.2, 1, 2.2, 1), year = c(1996L,
1996L, 1996L, 1996L, 1996L, 1996L)), row.names = c(NA, -6L
), spec = structure(list(cols = list(age = structure(list(), class = c("collector_integer",
"collector")), category = structure(list(), class = c("collector_character",
"collector")), citizenship = structure(list(), class = c("collector_character",
"collector")), company.name = structure(list(), class = c("collector_character",
"collector")), company.type = structure(list(), class = c("collector_character",
"collector")), `country code` = structure(list(), class = c("collector_character",
"collector")), founded = structure(list(), class = c("collector_integer",
"collector")), `from emerging` = structure(list(), class = c("collector_character",
"collector")), gdp = structure(list(), class = c("collector_double",
"collector")), gender = structure(list(), class = c("collector_character",
"collector")), industry = structure(list(), class = c("collector_character",
"collector")), inherited = structure(list(), class = c("collector_character",
"collector")), name = structure(list(), class = c("collector_character",
"collector")), rank = structure(list(), class = c("collector_integer",
"collector")), region = structure(list(), class = c("collector_character",
"collector")), relationship = structure(list(), class = c("collector_character",
"collector")), sector = structure(list(), class = c("collector_character",
"collector")), `was founder` = structure(list(), class = c("collector_character",
"collector")), `was political` = structure(list(), class = c("collector_character",
"collector")), wealth.type = structure(list(), class = c("collector_character",
"collector")), `worth in billions` = structure(list(), class = c("collector_double",
"collector")), year = structure(list(), class = c("collector_integer",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector"))), class = "col_spec"), class = c("tbl_df", "tbl",
"data.frame"))
load("bil.RData")
print(bil)
# Renaming a few columns for spacing issues
colnames(bil)[21] <- "worth_billions"
colnames(bil)[6] <- "country_code"
# Finding where billionaires are oldest/youngest on average,
# ... then removing less than five observations
bil %>%
filter(!is.na(age)) %>%
select(age, worth_billions, country_code) %>%
group_by(age, worth_billions, country_code) %>%
mutate(count = n()) %>%
arrange(country_code) %>%
#spread(key = country_code, value = "USA") %>%
print()
I expect to find the country that has the oldest billionaires and youngest billionaires, excluding countries with fewer than five observations. Any help is appreciated!
First remove the rows with missing 'age' (filter), then group by 'country_code' and drop the groups having fewer than 5 billionaires, summarise the mean of 'age' per country, and finally slice the row having the maximum value of 'ageMean'.
library(dplyr)
# Country whose billionaires are oldest on average, ignoring countries with
# fewer than five billionaires of known age.
bil %>%
  # Rows without an age cannot contribute to the mean.
  filter(!is.na(age)) %>%
  group_by(country_code) %>%
  # Keep only countries with at least five remaining observations.
  filter(n() >= 5) %>%
  summarise(ageMean = mean(age)) %>%
  # Use which.min(ageMean) instead to get the youngest on average.
  slice(which.max(ageMean))

Resources