I have a dataframe that looks like this (but for every US county)
county
state
n_state_1
n_state_2
n_state_3
n_state_4
Autauga County
AL
NA
FL
NA
NA
Baldwin County
AL
GA
NA
TN
NA
Catron County
AL
FL
GA
NA
CA
I want to move the non-missing values (FL,GA,TN etc.) to the first columns starting from n_state_1 and then delete the columns containing only missing values to get:
county
state
n_state_1
n_state_2
n_state_3
Autauga County
AL
FL
NA
NA
Baldwin County
AL
GA
TN
NA
Catron County
AL
FL
GA
CA
I am struggling with the first step. I thought about using the function distinct but it doesn't work because there are non-empty elements in each column.
You could use dplyr and tidyr:
library(dplyr)
library(tidyr)
# Collapse the n_state_* columns to long form, discard the missing entries,
# renumber what is left within each county/state pair, and spread the
# renumbered values back out into n_state_1, n_state_2, ...
df %>%
  pivot_longer(
    cols = starts_with("n_state"),
    names_to = "name",
    values_to = "value"
  ) %>%
  drop_na() %>%
  group_by(county, state) %>%
  # the running position inside the group becomes the new column suffix
  mutate(name = seq_len(n())) %>%
  pivot_wider(
    names_from = name,
    values_from = value,
    names_prefix = "n_state_"
  )
which returns
county state n_state_1 n_state_2 n_state_3
<chr> <chr> <chr> <chr> <chr>
1 Autauga_County AL FL NA NA
2 Baldwin_County AL GA TN NA
3 Catron_County AL FL GA CA
What happened here?
pivot_longer takes the n_state_{n}-columns and collapses them into two columns: the name-column contains the original column name (n_state_1, n_state_2 etc), the value-column contains the states (FL, GA or <NA> in many cases).
Next we remove every <NA> entry. (Note: I use <NA> to make clear it's an NA-value.)
After a grouping by county and state we add a rownumber. These numbers will be later used to create the new column names.
pivot_wider now takes these row numbers and prefixes them with n_state_ to get the new columns. The values are taken from the value-column created in the second line of code. pivot_wider fills the missing values with <NA>-values (default behaviour).
Data
structure(list(county = c("Autauga_County", "Baldwin_County",
"Catron_County"), state = c("AL", "AL", "AL"), n_state_1 = c(NA,
"GA", "FL"), n_state_2 = c("FL", NA, "GA"), n_state_3 = c(NA,
"TN", NA), n_state_4 = c(NA, NA, "CA")), problems = structure(list(
row = 3L, col = "n_state_4", expected = "", actual = "embedded null",
file = "literal data"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")), class = c("spec_tbl_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -3L), spec = structure(list(
cols = list(county = structure(list(), class = c("collector_character",
"collector")), state = structure(list(), class = c("collector_character",
"collector")), n_state_1 = structure(list(), class = c("collector_character",
"collector")), n_state_2 = structure(list(), class = c("collector_character",
"collector")), n_state_3 = structure(list(), class = c("collector_character",
"collector")), n_state_4 = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
Or another option with dapply from collapse and select only columns with any non-NA elements
library(collapse)
library(dplyr)
# Row by row, move the non-missing entries to the front and the NAs to the
# back; afterwards keep only the columns that still contain a value.
df1 %>%
  dapply(
    FUN = \(x) {
      present <- !is.na(x)
      c(x[present], x[!present])
    },
    MARGIN = 1
  ) %>%
  select(where(\(col) any(complete.cases(col))))
# A tibble: 3 x 5
county state n_state_1 n_state_2 n_state_3
<chr> <chr> <chr> <chr> <chr>
1 Autauga_County AL FL <NA> <NA>
2 Baldwin_County AL GA TN <NA>
3 Catron_County AL FL GA CA
Related
I have a table with prefixes (here in csv format):
PREFIX,LABEL
A,Infectious diseases
B,Infectious diseases
C,Tumor
D1,Tumor
D2,Tumor
D31,Tumor
D32,Tumor
D33,Blood disorder
D4,Blood disorder
D5,Blood disorder
And I want to join it with this one:
AGE,DEATH_CODE
67,A02
85,D318
75,C007+X
62,D338
To get obviously:
AGE,LABEL
67,Infectious diseases
85,Tumor
75,Tumor
62,Blood disorder
I know how to do that with SQL and LIKE but not with tidyverse left_join or base R.
Dput of data
Table 1: CIM_CODES
# Table 1 (prefix -> label lookup). The `problems = <pointer: ...>` entry that
# dput() printed for an external pointer is not valid R syntax and has been
# dropped so the snippet can be evaluated; the result is assigned to the name
# used by the code further down.
CIM_CODES <- structure(list(PREFIX = c("A", "B", "C", "D1", "D2", "D31", "D32",
"D33", "D4", "D5"), LABEL = c("Infectious diseases", "Infectious diseases",
"Tumor", "Tumor", "Tumor", "Tumor", "Tumor", "Blood disorder",
"Blood disorder", "Blood disorder")), row.names = c(NA, -10L), spec = structure(list(
cols = list(PREFIX = structure(list(), class = c("collector_character",
"collector")), LABEL = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
Table 2: DEATH_CAUSES
# Table 2 (one row per death record). The `problems = <pointer: ...>` entry
# that dput() printed for an external pointer is not valid R syntax and has
# been dropped so the snippet can be evaluated; the result is assigned to the
# name used by the code further down.
DEATH_CAUSES <- structure(list(AGE = c(67, 85, 75, 62), DEATH_CODE = c("A02",
"D318", "C007+X", "D338")), row.names = c(NA, -4L), spec = structure(list(
cols = list(AGE = structure(list(), class = c("collector_double",
"collector")), DEATH_CODE = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
You could do a partial string match that has the lowest difference between the two columns:
library(tidyverse)
# For every death code, compute its string distance to each known prefix and
# take the label belonging to the closest prefix.
DEATH_CAUSES |>
  mutate(
    LABEL = map_chr(DEATH_CODE, \(code) {
      distances <- stringdist::stringdist(code, CIM_CODES$PREFIX)
      CIM_CODES$LABEL[which.min(distances)]
    })
  )
#> # A tibble: 4 x 3
#> AGE DEATH_CODE LABEL
#> <dbl> <chr> <chr>
#> 1 67 A02 Infectious diseases
#> 2 85 D318 Tumor
#> 3 75 C007+X Tumor
#> 4 62 D338 Blood disorder
UPDATE
not using the stringdist package as requested.
library(tidyverse)
# Look up, for each element of `code`, the `target` value whose `prefix` the
# code starts with.
#
# Arguments:
#   code   - vector of codes to classify (coerced to character).
#   prefix - character vector of literal prefixes (not regular expressions).
#   target - vector of labels, parallel to `prefix`.
#
# Returns a character vector the same length as `code`. A code that matches no
# prefix yields NA; a code that matches several prefixes gets the label of the
# longest (most specific) one. Both cases previously raised an error.
get_match <- function(code, prefix, target) {
  stopifnot(length(prefix) == length(target))
  code <- as.character(code)
  prefix <- as.character(prefix)

  vapply(
    code,
    function(x) {
      # startsWith() compares literally, so characters such as "+" or "."
      # inside a prefix are not interpreted as regex syntax.
      hits <- which(startsWith(x, prefix))
      if (length(hits) == 0L) {
        return(NA_character_)
      }
      best <- hits[which.max(nchar(prefix[hits]))]
      as.character(target[best])
    },
    character(1),
    USE.NAMES = FALSE
  )
}
DEATH_CAUSES |>
mutate(LABEL = get_match(DEATH_CAUSES$DEATH_CODE,
CIM_CODES$PREFIX,
CIM_CODES$LABEL))
#> # A tibble: 4 x 3
#> AGE DEATH_CODE LABEL
#> <dbl> <chr> <chr>
#> 1 67 A02 Infectious diseases
#> 2 85 D318 Tumor
#> 3 75 C007+X Tumor
#> 4 62 D338 Blood disorder
EDIT
how to do this with a join:
library(tidyverse)
library(fuzzyjoin)
# Join every death record to the lookup row whose PREFIX the DEATH_CODE starts
# with. startsWith(code, prefix) anchors the comparison at the first character
# and treats the prefix literally; str_detect() would also accept a prefix
# found in the middle of a code and would read it as a regular expression.
fuzzy_left_join(
  DEATH_CAUSES,
  CIM_CODES,
  by = c("DEATH_CODE" = "PREFIX"),
  match_fun = startsWith
)
#> # A tibble: 4 x 4
#> AGE DEATH_CODE PREFIX LABEL
#> <dbl> <chr> <chr> <chr>
#> 1 67 A02 A Infectious diseases
#> 2 85 D318 D31 Tumor
#> 3 75 C007+X C Tumor
#> 4 62 D338 D33 Blood disorder
My code below, I used mysql:
-- Keep every row of `age` and attach the label whose prefix the death code
-- starts with. The pattern has no leading '%', so the prefix must match at the
-- beginning of death_code rather than anywhere inside it.
select a.age, p.label
from age a
left join prefix p on a.death_code like CONCAT(p.prefix, '%');
You can refer here: how to use a like with a join in sql?
I want to remove all rows after a certain string occurrence in a data frame column. I want to only return the 3 rows that appear above "total" appearing in column A. The 2 rows appearing below "total" would be excluded.
A B
Bob Smith 01005
Carl Jones 01008
Syndey Lewis 01185
total
Adam Price 01555
Megan Watson 02548
We can subset with row_number and which
library(dplyr)
# Keep the rows that come before the first "total" row. match() returns the
# position of the first occurrence only, and `nomatch` makes the filter keep
# every row when no "total" row exists.
df %>%
  filter(row_number() < match("total", A, nomatch = n() + 1L))
A B
1 Bob Smith 01005
2 Carl Jones 01008
3 Syndey Lewis 01185
You could use
library(dplyr)
# The running count of "total" rows stays at zero until the first one is
# reached, so this keeps exactly the rows above it.
df %>%
  filter(cumsum(A == "total") < 1)
This returns
# A tibble: 3 x 2
A B
<chr> <chr>
1 Bob Smith 01005
2 Carl Jones 01008
3 Syndey Lewis 01185
Data
structure(list(A = c("Bob Smith", "Carl Jones", "Syndey Lewis",
"total", "Adam Price", "Megan Watson"), B = c("01005", "01008",
"01185", NA, "01555", "02548")), problems = structure(list(row = 4L,
col = NA_character_, expected = "2 columns", actual = "1 columns",
file = "literal data"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")), class = c("spec_tbl_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -6L), spec = structure(list(
cols = list(A = structure(list(), class = c("collector_character",
"collector")), B = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
A <- c("Bob Smith", "Carl Jones", "Syndey Lewis", "total", "Adam Price",
       "Megan Watson")
B <- c("01005", "01008", "01185", "", "01555", "02548")
df <- data.frame(A, B)

# Position of the first "total" row; if there is none, point one past the end
# so that every row is kept.
val <- match("total", df$A, nomatch = nrow(df) + 1L)

# seq_len(val - 1) is used instead of `1:val-1`, which R parses as
# `(1:val) - 1` and which only worked because index 0 is silently dropped.
C <- df[seq_len(val - 1), ]
It's a little clunky but this should solve what you're wanting it to do:
library(dplyr)
df <- data.frame(
  A = c("Bob Smith", "Carl Jones", "Sydney Lewis", "total", "Adam Price",
        "Megan Watson"),
  B = c("01005", "01008", "01185", NA, "01555", "02548")
)

# Integer position of the first "total" row (one past the end if absent).
# rownames() would return a character value instead of a number.
index <- match("total", df$A, nomatch = nrow(df) + 1L)

# Stop one row before "total" so the "total" row itself is excluded.
df %>% slice(seq_len(index - 1L))
I need a chart of accounts to stay in order when new accounts are added or dropped in future years. This is because in Accounting the accounts are sorted by type (for example Asset, Liability, Equity) but it is not explicit in the dataset. This is an example of the code that is putting new "Accounts" from Year2 and Year3 at the bottom.
# Merge the three yearly charts on Account, keeping accounts that appear in
# any year. (The first full_join() was missing the comma before `by`.)
XYZCompany_Consolidated <- XYZCompany_Year1 %>%
  full_join(XYZCompany_Year2, by = "Account") %>%
  full_join(XYZCompany_Year3, by = "Account")
Example: This picture is just to give a simplified example. The highlight in orange is where the new accounts are going and to the right is the code i'm using, and the green is what I'm trying to achieve
Perhaps I'm overthinking this problem but I find it hard to solve. Let's define some data first:
df_year1 <- structure(list(Account = c("Cash", "Accounts", "Loan1", "Auto",
"JaneDoe"), Year_1 = c(100, 1000, 20, 300, 500)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -5L), spec = structure(list(
cols = list(Account = structure(list(), class = c("collector_character",
"collector")), Year_1 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
df_year2 <- structure(list(Account = c("Cash", "Accounts", "Loan1", "Auto",
"Laptop", "JaneDoe"), Year_2 = c(80, 1200, 50, 300, 500, 0)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), spec = structure(list(
cols = list(Account = structure(list(), class = c("collector_character",
"collector")), Year_2 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
df_year3 <- structure(list(Account = c("Cash", "Accounts", "Loan1", "Auto",
"Rent", "JaneDoe"), Year_3 = c(80, 1200, 50, 300, 1000, 0)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), spec = structure(list(
cols = list(Account = structure(list(), class = c("collector_character",
"collector")), Year_3 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
Those are similar to the data shown in the OP's picture, e.g. df_year1 looks like
# A tibble: 5 x 2
Account Year_1
<chr> <dbl>
1 Cash 100
2 Accounts 1000
3 Loan1 20
4 Auto 300
5 JaneDoe 500
Next we transform those data a little bit, namely
library(dplyr)
library(tidyr)
# Bring year 1 into a common layout: the amount column is called `value`,
# `Year` holds the year number and `no` the account's position in that chart.
df_y1 <- df_year1 %>%
  rename(value = Year_1) %>%
  mutate(
    Year = 1,
    no = row_number()
  )
which returns
# A tibble: 5 x 4
Account value Year no
<chr> <dbl> <dbl> <int>
1 Cash 100 1 1
2 Accounts 1000 1 2
3 Loan1 20 1 3
4 Auto 300 1 4
5 JaneDoe 500 1 5
The new column no stores the account's original position, column Year stores the chart's year. All three data.frames are processed like this, so we get df_y1, df_y2, df_y3.
Finally we bind them together
# Stack the yearly tables, derive one ordering key per account, then spread
# the years back into columns and sort by that key.
bind_rows(df_y1, df_y2, df_y3) %>%
  mutate(num_years = max(Year)) %>%
  group_by(Account) %>%
  # rank = sum of the account's positions, scaled up by the number of years in
  # which the account is absent; `.keep = "unused"` drops `num_years` and `no`.
  mutate(rank = sum((num_years - n() + 1) * no), .keep = "unused") %>%
  # names_prefix gives Year_1, Year_2, ... instead of bare `1`, `2`, ...
  pivot_wider(
    names_from = Year,
    values_from = value,
    names_prefix = "Year_"
  ) %>%
  arrange(rank) %>%
  select(-rank) %>%
  ungroup()
and calculate a rank for each account. The accounts are ordered by this rank. As a result, we get
# A tibble: 7 x 4
Account Year_1 Year_2 Year_3
<chr> <dbl> <dbl> <dbl>
1 Cash 100 80 80
2 Accounts 1000 1200 1200
3 Loan1 20 50 50
4 Auto 300 300 300
5 Laptop NA 500 NA
6 Rent NA NA 1000
7 JaneDoe 500 0 0
Note
I believe, there are better approaches, but at least this works for the example data.
I'm not sure about the calculated rank's stability. Take care.
I have a dataframe that looks like this (but for every US county)
county
state
neighbor_county
neighbor_state
Baldwin County
AL
Clarke County
NA
Baldwin County
AL
Escambia County
FL
Baldwin County
AL
Mobile County
NA
Baldwin County
AL
Monroe County
NA
Barbour County
AL
Dale County
NA
Barbour County
AL
Henry County
NA
I am only interested in what states neighbor a county, so I want to remove repeated data to get this (step 1):
county
state
neighbor_state
Baldwin County
AL
NA
Baldwin County
AL
FL
Barbour County
AL
NA
And then reshape the dataframe like this (step 2):
county
state
neighbor_state_1
neighbor_state_2
neighbor_state_3
Baldwin County
AL
FL
NA
NA
Baldwin County
AL
NA
NA
NA
In step 1 I've deleted the "neighbor_county" column; however, I've not managed to remove the duplicates in the column "neighbor_state" for each distinct county. I have tried using the unique function but I can't seem to make it work such that it only removes duplicates of each distinct county.
For your first step you could drop the neighbor_county column and then use unique():
df$neighbor_county <- NULL
unique(df)
returns
county state neighbor_state
1 Baldwin_County AL NA
2 Baldwin_County AL FL
5 Barbour_County AL NA
An alternative using dplyr:
df %>%
select(-neighbor_county) %>%
distinct()
For your second step I make a suggestion:
library(tidyr)
library(dplyr)
df %>%
group_by(county) %>%
select(-neighbor_county) %>%
mutate(n = row_number()) %>%
pivot_wider(names_from=n, names_prefix="neighbor_state_", values_from=neighbor_state) %>%
ungroup()
returns
# A tibble: 2 x 6
county state neighbor_state_1 neighbor_state_2 neighbor_state_3 neighbor_state_4
<chr> <chr> <chr> <chr> <chr> <chr>
1 Baldwin_County AL 'NA' 'FL' 'NA' 'NA'
2 Barbour_County AL 'NA' 'NA' NA NA
but I'm not sure, if this is what you are looking for.
For removing doubled NA-values, you could use
# Drop the neighbor county, collapse repeated county/state/neighbor_state rows,
# number the remaining neighbor states within each county and spread them into
# neighbor_state_1, neighbor_state_2, ...
df %>%
  group_by(county) %>%
  select(-neighbor_county) %>%
  distinct() %>%
  mutate(n = seq_len(n())) %>%
  pivot_wider(
    values_from = neighbor_state,
    names_from = n,
    names_prefix = "neighbor_state_"
  ) %>%
  ungroup()
Data
structure(list(county = c("Baldwin_County", "Baldwin_County",
"Baldwin_County", "Baldwin_County", "Barbour_County", "Barbour_County"
), state = c("AL", "AL", "AL", "AL", "AL", "AL"), neighbor_county = c("Clarke_County",
"Escambia_County", "Mobile_County", "Monroe_County", "Dale_County",
"Henry_County"), neighbor_state = c("'NA'", "'FL'", "'NA'", "'NA'",
"'NA'", "'NA'")), problems = structure(list(row = 6L, col = "neighbor_state",
expected = "", actual = "embedded null", file = "literal data"), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame")), class = "data.frame", row.names = c(NA,
-6L), spec = structure(list(cols = list(county = structure(list(), class = c("collector_character",
"collector")), state = structure(list(), class = c("collector_character",
"collector")), neighbor_county = structure(list(), class = c("collector_character",
"collector")), neighbor_state = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
Currently struggling with a problem finding the country where billionaires are the oldest and youngest on average in a given country (data set shown below).
Moreover, I need to remove countries that have less than five observations.
I have code below that shows a table of age, worth in billions, and country code. I have countries sorted, but I am unsure of the best strategy to find which country has the youngest/oldest billionaires.
I have one line struck out where I am trying to spread by country name, but I think that will probably be messy.
Small sample here:
# A tibble: 2,614 x 22
age category citizenship company.name company.type `country code` founded
<int> <chr> <chr> <chr> <chr> <chr> <int>
1 NA Financi… Saudi Arab… Rolaco Trad… new SAU 1968
2 34 Financi… United Sta… Fidelity In… new USA 1946
3 59 Non-Tra… Brazil Companhia B… new BRA 1948
4 61 New Sec… Germany Ratiopharm new DEU 1881
5 NA Financi… Hong Kong Swire new HKG 1816
6 NA Traded … Bahrain YBA Kanoo new BHR 1890
7 NA New Sec… Japan Otsuka Hold… new JPN 1921
8 NA Traded … Japan Sony new JPN 1946
9 66 Financi… Japan Mori Buildi… new JPN 1959
10 NA Traded … France Chanel new FRA 1909
# … with 2,604 more rows, and 15 more variables: `from emerging` <chr>,
# gdp <dbl>, gender <chr>, industry <chr>, inherited <chr>, name <chr>,
# rank <int>, region <chr>, relationship <chr>, sector <chr>, `was
# founder` <chr>, `was political` <chr>, wealth.type <chr>, `worth in
# billions` <dbl>, year <int>
dput(head(bil))
structure(list(age = c(NA, 34L, 59L, 61L, NA, NA), category = c("Financial",
"Financial", "Non-Traded Sectors", "New Sectors", "Financial",
"Traded Sectors"), citizenship = c("Saudi Arabia", "United States",
"Brazil", "Germany", "Hong Kong", "Bahrain"), company.name = c("Rolaco Trading and Contracting Company",
"Fidelity Investments", "Companhia Brasileira de Distribui?ao",
"Ratiopharm", "Swire", "YBA Kanoo"), company.type = c("new",
"new", "new", "new", "new", "new"), country_code = c("SAU", "USA",
"BRA", "DEU", "HKG", "BHR"), founded = c(1968L, 1946L, 1948L,
1881L, 1816L, 1890L), `from emerging` = c("True", "True", "True",
"True", "True", "True"), gdp = c(1.58e+11, 8.1e+12, 8.54e+11,
2.5e+12, 1.6e+11, 6.1e+09), gender = c("male", "female", "male",
"male", "male", "male"), industry = c("Money Management", "Money Management",
"Retail, Restaurant", "Technology-Medical", "Money Management",
"Consumer"), inherited = c("True", "True", "True", "True", "True",
"True"), name = c("Abdul Aziz Al-Sulaiman", "Abigail Johnson",
"Abilio dos Santos Diniz", "Adolf Merckle", "Adrian and John Swire",
"Ahmed Ali Kanoo"), rank = c(404L, 145L, 322L, 388L, 162L, 383L
), region = c("Middle East/North Africa", "North America", "Latin America",
"Europe", "East Asia", "Middle East/North Africa"), relationship = c("founder",
"relation", "relation", "relation", "relation", "relation"),
sector = c("construction", "investment banking", "retail",
"pharmaceuticals", "trading company", "shipping"), `was founder` = c("True",
"True", "True", "True", "True", "True"), `was political` = c("False",
"False", "False", "False", "False", "True"), wealth.type = c("self-made finance",
"inherited", "inherited", "inherited", "inherited", "inherited"
), worth_billions = c(1, 2.5, 1.2, 1, 2.2, 1), year = c(1996L,
1996L, 1996L, 1996L, 1996L, 1996L)), row.names = c(NA, -6L
), spec = structure(list(cols = list(age = structure(list(), class = c("collector_integer",
"collector")), category = structure(list(), class = c("collector_character",
"collector")), citizenship = structure(list(), class = c("collector_character",
"collector")), company.name = structure(list(), class = c("collector_character",
"collector")), company.type = structure(list(), class = c("collector_character",
"collector")), `country code` = structure(list(), class = c("collector_character",
"collector")), founded = structure(list(), class = c("collector_integer",
"collector")), `from emerging` = structure(list(), class = c("collector_character",
"collector")), gdp = structure(list(), class = c("collector_double",
"collector")), gender = structure(list(), class = c("collector_character",
"collector")), industry = structure(list(), class = c("collector_character",
"collector")), inherited = structure(list(), class = c("collector_character",
"collector")), name = structure(list(), class = c("collector_character",
"collector")), rank = structure(list(), class = c("collector_integer",
"collector")), region = structure(list(), class = c("collector_character",
"collector")), relationship = structure(list(), class = c("collector_character",
"collector")), sector = structure(list(), class = c("collector_character",
"collector")), `was founder` = structure(list(), class = c("collector_character",
"collector")), `was political` = structure(list(), class = c("collector_character",
"collector")), wealth.type = structure(list(), class = c("collector_character",
"collector")), `worth in billions` = structure(list(), class = c("collector_double",
"collector")), year = structure(list(), class = c("collector_integer",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector"))), class = "col_spec"), class = c("tbl_df", "tbl",
"data.frame"))
load("bil.RData")
print(bil)
# Renaming a few columns for spacing issues
colnames(bil)[21] <- "worth_billions"
colnames(bil)[6] <- "country_code"
# Finding where billionaires are oldest/youngest on average,
# ... then removing less than five observations
bil %>%
filter(!is.na(age)) %>%
select(age, worth_billions, country_code) %>%
group_by(age, worth_billions, country_code) %>%
mutate(count = n()) %>%
arrange(country_code) %>%
#spread(key = country_code, value = "USA") %>%
print()
I expect to find the country that has the oldest billionaires and youngest billionaires, excluding countries with fewer than five observations. Any help is appreciated!
After removing the NA elements in 'age' (filter), group by 'country_code' and filter out the groups having fewer than 5 billionaires, then summarise the mean of 'age' and slice the row having the maximum value of 'ageMean'
library(dplyr)
# Country with the highest mean billionaire age, ignoring rows without an age
# and countries with fewer than five remaining observations.
bil %>%
  filter(!is.na(age)) %>%
  group_by(country_code) %>%
  # n() is the number of rows in the current country group
  filter(n() >= 5) %>%
  summarise(ageMean = mean(age)) %>%
  # swap in which.min(ageMean) to get the youngest country instead
  slice(which.max(ageMean))