How can I add columns to a data frame with a value determined by values in other columns? - r

I have a dataset on population in counties in the US. I want to add a column for the state the county is in and one for the state code. Both are already available in the dataset but "hidden" inside other columns.
For instance, from the output we can see that the first observation says NAME = "Ada County, Idaho" and GEOID = "16001". I want one column with State = "Idaho" and one column with StateID = "16".
Thank you!
structure(list(NAME = c("Ada County, Idaho", "Ada County, Idaho",
"Ada County, Idaho", "Ada County, Idaho", "Ada County, Idaho",
"Ada County, Idaho"), GEOID = c("16001", "16001", "16001", "16001",
"16001", "16001"), year = c("2007", "2007", "2007", "2007", "2007",
"2007"), POP25 = c(205888, 205888, 205888, 205888, 205888, 205888
), EMPLOY25 = c(205888, 208506, 212770, 212272, 216058, 220856
)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L), groups = structure(list(NAME = "Ada County, Idaho", GEOID = "16001",
.rows = structure(list(1:6), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -1L), .drop = TRUE))

Perhaps this helps: to create 'State', remove the part of 'NAME' up to and including the comma and the one or more spaces that follow it (\\s+); to create 'StateID', take the first two characters of the 'GEOID' column using substr.
library(dplyr)
library(stringr)
# Drop the existing grouping, then derive both new columns from existing ones.
df1 |>
  ungroup() |>
  mutate(
    # Strip everything up to the comma and the whitespace that follows it.
    State = str_remove(NAME, ".*,\\s+"),
    # Keep the first two characters of GEOID.
    StateID = substr(GEOID, 1, 2)
  )

Here is an alternative using str_extract and str_sub:
library(dplyr)
library(stringr)
# Build one alternation of all state names and anchor it to the end of the
# string. Without the `$` anchor the leftmost state name wins, so a county that
# is itself named after a state (e.g. "Washington County, Oregon") would yield
# the county's name instead of the state.
# `state.name` holds the 50 states only; any other suffix gives NA.
pattern <- paste0("(", paste(state.name, collapse = "|"), ")$")
df %>%
  mutate(
    State = str_extract(NAME, pattern),
    # The first two characters of GEOID are the state code.
    StateID = str_sub(GEOID, 1, 2)
  )
NAME GEOID year POP25 EMPLOY25 State StateID
<chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
1 Ada County, ~ 16001 2007 205888 205888 Idaho 16
2 Ada County, ~ 16001 2007 205888 208506 Idaho 16
3 Ada County, ~ 16001 2007 205888 212770 Idaho 16
4 Ada County, ~ 16001 2007 205888 212272 Idaho 16
5 Ada County, ~ 16001 2007 205888 216058 Idaho 16
6 Ada County, ~ 16001 2007 205888 220856 Idaho 16

Related

Trying to calculate the Expected Value of an observation

I have a tibble and am trying to use values from two specific rows (Pinnacle book) to perform a calculation. The values of the calculation will be written to a new column. Here is the output of dput
structure(list(id = c("5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51",
"5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51",
"5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51"
), start = structure(c(1676691000, 1676691000, 1676691000, 1676691000,
1676691000, 1676691000), tzone = "UTC", class = c("POSIXct",
"POSIXt")), book = c("BetUS", "BetUS", "Bovada", "Bovada", "Pinnacle",
"Pinnacle"), home = c("San José St Spartans", "San José St Spartans",
"San José St Spartans", "San José St Spartans", "San José St Spartans",
"San José St Spartans"), away = c("New Mexico Lobos", "New Mexico Lobos",
"New Mexico Lobos", "New Mexico Lobos", "New Mexico Lobos", "New Mexico Lobos"
), team = c("San José St Spartans", "New Mexico Lobos", "San José St Spartans",
"New Mexico Lobos", "San José St Spartans", "New Mexico Lobos"
), price = c(-140, 120, -140, 120, -138, 117), update = c("2023-02-18T00:24:43Z",
"2023-02-18T00:24:43Z", "2023-02-18T00:25:10Z", "2023-02-18T00:25:10Z",
"2023-02-18T00:25:04Z", "2023-02-18T00:25:04Z"), bep = c(0.58333,
0.45455, 0.58333, 0.45455, 0.57983, 0.46083), no_vig = c(-128.33333,
128.33333, -128.33333, 128.33333, -125.82353, 125.82353), no_vig_bep = c(0.56204,
0.43796, 0.56204, 0.43796, 0.55718, 0.44282), win = c(71.43,
120, 71.43, 120, 72.46, 117)), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -6L), groups = structure(list(
id = c("5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51",
"5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51",
"5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51"
), book = c("BetUS", "BetUS", "Bovada", "Bovada", "Pinnacle",
"Pinnacle"), team = c("New Mexico Lobos", "San José St Spartans",
"New Mexico Lobos", "San José St Spartans", "New Mexico Lobos",
"San José St Spartans"), .rows = structure(list(2L, 1L,
4L, 3L, 6L, 5L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE))
The following is the calculation
-4.482633 = (.55718 * 71.43) - (.44282 * 100)
The values in the calculation above correspond with the following variables
-4.482633 = Expected Value I am trying to derive
.55718 = "no_vig_bep" of Pinnacle
71.43 = "win" of observation 1
.44282 = 1 - "no_vig_bep" of Pinnacle or the last row
100 = a set amount
I would then like to calculate the other side of the odds as follows
-2.5796 = (.44282 * 120.00) - (.55718 * 100)
The ultimate goal is to use the values of the Pinnacle book to perform the above calculation against all other books. The EV will be written to a new column.
Included additional id for further clarification
structure(list(id = c("073c154f3c8586868a3ba21522161a70",
"073c154f3c8586868a3ba21522161a70",
"073c154f3c8586868a3ba21522161a70", "073c154f3c8586868a3ba21522161a70",
"073c154f3c8586868a3ba21522161a70", "073c154f3c8586868a3ba21522161a70"
), book = c("Bovada", "Pinnacle", "MyBookie.ag", "MyBookie.ag",
"Pinnacle", "Bovada"), home = c("Western Michigan Broncos", "Western
Michigan Broncos",
"Western Michigan Broncos", "Western Michigan Broncos", "Western
Michigan Broncos",
"Western Michigan Broncos"), away = c("Ball State Cardinals",
"Ball State Cardinals", "Ball State Cardinals", "Ball State Cardinals",
"Ball State Cardinals", "Ball State Cardinals"), team = c("Western
Michigan Broncos",
"Ball State Cardinals", "Western Michigan Broncos", "Ball State
Cardinals",
"Western Michigan Broncos", "Ball State Cardinals"), price = c(-185,
-143, -142, 100, 108, 140), bep = c(0.64912, 0.58848, 0.58678,
0.5, 0.48077, 0.41667), no_vig = c(-155.78947, -122.40329, -117.35537,
117.35537, 122.40329, 155.78947), no_vig_bep = c(0.60905, 0.55037,
0.53992, 0.46008, 0.44963, 0.39095), win = c(54.05, 69.93, 70.42,
100, 108, 140), EV_1 = c(-15.2155015, -6.47562589999999,
-6.20594459999999,
-10.074, -6.47696000000001, 7.91119999999999)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), groups =
structure(list(
book = c("Bovada", "MyBookie.ag", "Pinnacle"), .rows = structure(list(
c(1L, 6L), 3:4, c(2L, 5L)), ptype = integer(0), class =
c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -3L), .drop = TRUE))
Edit1: Maybe purrr can help you here:
# dplyr supplies filter(), pull(), ungroup(), group_split(), group_by(),
# mutate() and select(); without it attached, filter() resolves to
# stats::filter() and the snippet fails.
library(dplyr)
library(purrr)
# One element per game id: Pinnacle's no-vig probabilities in row order.
no_vig_pin_list <- df |>
  ungroup() |>
  split(df$id) |>
  map(~ .x |> filter(book == "Pinnacle") |> pull(no_vig_bep))
# Split the data by id and pair each piece with that id's Pinnacle values.
# NOTE(review): the pairing assumes split() and group_split() return the ids in
# the same order -- confirm for your data.
df |>
  ungroup() |>
  group_split(id) |>
  purrr::map2_dfr(no_vig_pin_list, ~ .x |>
    group_by(book) |>
    # The first row of each book uses Pinnacle's first probability and the
    # remaining rows use the second, so rows must be in the same team order
    # within every book.
    mutate(EV_1 = ifelse(row_number() == 1,
      (.y[1] * win) - ((1 - .y[1]) * 100),
      (.y[2] * win) - ((1 - .y[2]) * 100)
    ))) |>
  select(EV_1)
# A tibble: 8 × 2
# Groups: book [2]
book EV_1
<chr> <dbl>
1 BetUS -5.03
2 BetUS -1.10
3 Pinnacle -4.07
4 Pinnacle -4.07
5 BetUS -4.48
6 BetUS -2.58
7 Pinnacle -3.91
8 Pinnacle -3.91
Maybe this helps, I am not quite sure that I understood what you are trying to achieve.
library(dplyr)
# Pinnacle's no-vig probabilities, in row order.
no_vig_pin <- filter(df, book == "Pinnacle") |>
  pull(no_vig_bep)
# Within each book, the first row is priced against Pinnacle's first
# probability and every other row against its second.
df |>
  group_by(book) |>
  mutate(
    EV_1 = ifelse(
      row_number() == 1,
      (no_vig_pin[1] * win) - ((1 - no_vig_pin[1]) * 100),
      (no_vig_pin[2] * win) - ((1 - no_vig_pin[2]) * 100)
    )
  ) |>
  select(EV_1)
Output:
# A tibble: 6 × 2
# Groups: book [3]
book EV_1
<chr> <dbl>
1 BetUS -4.48
2 BetUS -2.58
3 Bovada -4.48
4 Bovada -2.58
5 Pinnacle -3.91
6 Pinnacle -3.91

retain only rows and columns that match with a string vector

I have a large data frame in which certain columns hold comma-separated lists of character values, as below. The number of such columns varies from dataset to dataset, as does the number of values each one holds.
ID Country1 Country2 Country3
1 1 Argentina, Japan,USA,Poland, Argentina,USA Pakistan
2 2 Colombia, Mexico,Uruguay,Dutch Mexico,Uruguay Afganisthan
3 3 Argentina, Japan,USA,NA Japan Khazagistan
4 4 Colombia, Mexico,Uruguay,Dutch Colombia, Dutch North Korea
5 5 India, China China Iran
Would like to match them one-to-one with another string vector as below
vals_to_find <-c("Argentina","USA","Mexico")
If a column/row matches any one of the strings passed, I would like to retain that column and row, remove duplicates, and finally remove the values that do not match.
the desired output is as follows
ID Countries.found
1 1 Argentina, USA
2 2 Mexico
3 3 Argentina, USA
4 4 Mexico
data
dput(df)
structure(list(ID = 1:5, Country1 = c("Argentina, Japan,USA,Poland,",
"Colombia, Mexico,Uruguay,Dutch", "Argentina, Japan,USA,NA",
"Colombia, Mexico,Uruguay,Dutch", "India, China"), Country2 = c("Argentina,USA",
"Mexico,Uruguay", "Japan", "Colombia, Dutch", "China"), Country3 = c("Pakistan",
"Afganisthan", "Khazagistan", "North Korea", "Iran")), class = "data.frame", row.names = c(NA,
-5L))
dput(df_out)
structure(list(ID = 1:4, Countries.found = c("Argentina, USA",
"Mexico", "Argentina, USA", "Mexico")), class = "data.frame", row.names = c(NA,
-4L))
If, instead of each column holding a list of values, the file is read with one value per column, then I was able to do it as below
dput(df_out)
structure(list(ID = 1:5, X1 = c("Argentina", "Colombia", "Argentina",
"Colombia", "India"), X2 = c("Japan", "Mexico", "Japan", "Mexico",
"China"), X3 = c("USA", "Uruguay", "USA", "Uruguay", NA), X4 = c("Poland",
"Dutch", NA, "Dutch", NA), X5 = c("Argentina", "Mexico", "Japan",
"Colombia", "China"), X6 = c("USA", "Uruguay", NA, "Dutch", NA
), X7 = c("Pakistan", "Afganisthan", "Khazagistan", "North Korea",
"Iran")), class = "data.frame", row.names = c(NA, -5L))
# Reduce the one-value-per-column frame to the first column plus a single
# "countries_found" column built from the values present in vals_to_find.
df_out %>%
# Drop columns that are entirely NA.
dplyr::select(
where(~ !all(is.na(.x)))
) %>%
# Keep the first column and every column that holds at least one value from vals_to_find.
dplyr::select(c(1, where(~ any(.x %in% vals_to_find)))) %>%
# match() returns NA for values not in vals_to_find, so non-matching cells become NA.
dplyr::mutate(dplyr::across(
tidyselect::starts_with("X"),
~ vals_to_find[match(., vals_to_find)]
)) %>%
# Paste the X columns together, skipping NAs; repeated values are not removed.
tidyr::unite("countries_found", tidyselect::starts_with("X"),
sep = " | ", remove = TRUE, na.rm = TRUE
)
Output
ID countries_found
1 1 Argentina | USA | Argentina | USA
2 2 Mexico | Mexico
3 3 Argentina | USA
4 4 Mexico
Unite the "Country" columns, then separate the values into rows to get one long column, keep the distinct values per ID, filter for those that are in vals_to_find, and summarise each ID's matches into Countries.found with toString.
library(tidyr)
library(dplyr)
df %>%
  # Glue all Country* columns into one comma-separated string per row.
  unite("Country", starts_with("Country"), sep = ",") %>%
  # Split on commas (plus surrounding whitespace) only. The default `sep`
  # splits on every non-alphanumeric run and would break multi-word names such
  # as "North Korea" into "North" and "Korea".
  separate_rows(Country, sep = "\\s*,\\s*") %>%
  # One row per ID/country pair.
  distinct(ID, Country) %>%
  # Keep only the countries being searched for; this also drops the empty
  # strings produced by trailing or doubled commas.
  filter(Country %in% vals_to_find) %>%
  group_by(ID) %>%
  summarise(Countries.found = toString(Country))
output
# A tibble: 4 × 2
ID Countries.found
<int> <chr>
1 1 Argentina, USA
2 2 Mexico
3 3 Argentina, USA
4 4 Mexico
We may use
library(dplyr)
library(tidyr)
library(stringr)
df %>%
  # Replace each Country* column by a list-column holding the searched-for
  # values found in it.
  mutate(
    across(
      starts_with("Country"),
      \(col) str_extract_all(col, str_c(vals_to_find, collapse = "|"))
    )
  ) %>%
  # Stack the list-columns, discarding the original column names.
  pivot_longer(-ID, names_to = NULL, values_to = "Countries.found") %>%
  # One row per extracted value, then de-duplicate within each ID.
  unnest(Countries.found) %>%
  distinct() %>%
  group_by(ID) %>%
  summarise(Countries.found = toString(Countries.found))
-output
# A tibble: 4 × 2
ID Countries.found
<int> <chr>
1 1 Argentina, USA
2 2 Mexico
3 3 Argentina, USA
4 4 Mexico

Extracting and evaluating words in a text string against another dataset

I have two sets of data that I will be evaluating against one another. A heavily reduced example looks like this:
library(dplyr)
library(tidyverse)
library(sqldf)
library(dbplyr)
library(httr)
library(purrr)
library(jsonlite)
library(magrittr)
library(tidyr)
library(tidytext)
people_records_ex <- structure(list(id = c(123L, 456L, 789L), name = c("Anna Wilson",
"Jeff Smith", "Craig Mills"), biography = c("Student at Ohio State University. Class of 2024.",
"Second year law student at Stanford. Undergrad at William & Mary",
"University of North Texas Volleyball!")), class = "data.frame", row.names = c(NA,
-3L))
college_records_ex <- structure(list(college_id = c(234L, 567L, 891L, 345L), college_name = c("Ohio State University",
"Stanford", "William & Mary", "University of North Texas"), college_city = c("Columbus",
"Stanford", "Williamsburg", "Denton"), college_state = c("OH",
"CA", "VA", "TX")), class = "data.frame", row.names = c(NA, -4L
))
I am trying to create a match against the contents of the biography text string in people_records_ex against college_name in college_records_ex so the final output will look like this:
final_records_ex <- structure(list(id = c(123L, 456L, 456L, 789L), name = c("Anna Wilson",
"Jeff Smith", "Jeff Smith", "Craig Mills"), college_name = c("Ohio State University",
"Stanford", "William & Mary", "University of North Texas"), college_city = c("Columbus",
"Stanford", "Williamsburg", "Denton"), college_state = c("OH",
"CA", "VA", "TX")), class = "data.frame", row.names = c(NA, -4L
))
Or to provide a more visual example of the final output I'm expecting:
But when I run the following code, it produces zero results, which is not correct:
college_extract <- people_records_ex %>%
left_join(college_records_ex, by = c("biography" = "college_name")) %>%
filter(!is.na(college_state)) %>% dplyr::select(id, name, college_name, college_city, college_state) %>% distinct()
What am I doing incorrectly and what would the correct version look like?
Here's a very tidy and straightforward solution with fuzzy_join:
library(fuzzyjoin)
library(stringr)
library(dplyr)
# Join every person to each college whose name occurs in their biography.
fuzzy_join(
  people_records_ex, college_records_ex,
  by = c("biography" = "college_name"),
  # fixed() makes str_detect() treat the college name as literal text. A bare
  # str_detect would interpret it as a regular expression, so names containing
  # characters such as "." or "(" could mis-match or raise an error.
  match_fun = \(bio, college) str_detect(bio, fixed(college)),
  # A left join keeps people without any matching college (college columns NA).
  mode = "left"
) %>%
  select(-biography)
id name college_id college_name college_city college_state
1 123 Anna Wilson 234 Ohio State University Columbus OH
2 456 Jeff Smith 567 Stanford Stanford CA
3 456 Jeff Smith 891 William & Mary Williamsburg VA
4 789 Craig Mills 345 University of North Texas Denton TX
Assuming the college names in the biographies are spelled out exactly as they appear in the colleges table and the datasets are relatively small, all matches can be generated with a regex of all college names as follows
library(dplyr)
people_records_ex <- structure(list(id = c(123L, 456L, 789L), name = c(
"Anna Wilson",
"Jeff Smith", "Craig Mills"
), biography = c(
"Student at Ohio State University. Class of 2024.",
"Second year law student at Stanford. Undergrad at William & Mary",
"University of North Texas Volleyball!"
)), class = "data.frame", row.names = c(
NA,
-3L
)) %>% tibble::tibble()
college_records_ex <- structure(list(college_id = c(234L, 567L, 891L, 345L), college_name = c(
"Ohio State University",
"Stanford", "William & Mary", "University of North Texas"
), college_city = c(
"Columbus",
"Stanford", "Williamsburg", "Denton"
), college_state = c(
"OH",
"CA", "VA", "TX"
)), class = "data.frame", row.names = c(NA, -4L)) %>%
tibble::tibble()
# join college names in a regex pattern
colleges_regex <- paste0(college_records_ex$college_name, collapse = "|")
colleges_regex
#> [1] "Ohio State University|Stanford|William & Mary|University of North Texas"
# match all against bio, giving a list-column of matches
people_records_ex %>%
mutate(matches = stringr::str_match_all(biography, colleges_regex))
#> # A tibble: 3 × 4
#> id name biography matches
#> <int> <chr> <chr> <list>
#> 1 123 Anna Wilson Student at Ohio State University. Class of 2024. <chr[…]>
#> 2 456 Jeff Smith Second year law student at Stanford. Undergrad at … <chr[…]>
#> 3 789 Craig Mills University of North Texas Volleyball! <chr[…]>
# unnest the list column longer to give 1 row per person per match
# (str_match_all() returns a matrix per biography, hence the matches[,1] column in the output)
people_records_ex %>%
mutate(matches = stringr::str_match_all(biography, colleges_regex)) %>%
tidyr::unnest_longer(matches)
#> # A tibble: 4 × 4
#> id name biography match…¹
#> <int> <chr> <chr> <chr>
#> 1 123 Anna Wilson Student at Ohio State University. Class of 2024. Ohio S…
#> 2 456 Jeff Smith Second year law student at Stanford. Undergrad at W… Stanfo…
#> 3 456 Jeff Smith Second year law student at Stanford. Undergrad at W… Willia…
#> 4 789 Craig Mills University of North Texas Volleyball! Univer…
#> # … with abbreviated variable name ¹​matches[,1]
Created on 2022-10-26 with reprex v2.0.2
This may be joined back to the college table such that it is annotated with college info.
In base R you can do:
# For each college, find the people whose biography mentions it and attach that
# college's columns to exactly those rows. Binding the college columns inside
# the loop keeps rows aligned even when a college matches no one or several
# people; a single cbind() after rbind() is only correct when every college
# matches exactly one person.
matched <- do.call(rbind, lapply(seq_len(nrow(college_records_ex)), \(i) {
  # fixed = TRUE: treat the college name as literal text, not as a regex.
  hit <- grepl(college_records_ex$college_name[i],
    people_records_ex$biography,
    fixed = TRUE
  )
  cbind(
    people_records_ex[hit, 1:2], # id and name
    college_records_ex[rep(i, sum(hit)), -1] # college info without college_id
  )
}))
# Discard the row names inherited from the subsetting above.
rownames(matched) <- NULL
matched
This does some matching and I subsetted the first two columns which are the id and name, cbinding it with the second data.frame getting rid of the first column
id name college_name college_city college_state
1 123 Anna Wilson Ohio State University Columbus OH
2 456 Jeff Smith Stanford Stanford CA
21 456 Jeff Smith William & Mary Williamsburg VA
3 789 Craig Mills University of North Texas Denton TX

How to remove rows that have repeated elements?

I have a dataframe that looks like this (but for every US county)
county
state
neighbor_county
neighbor_state
Baldwin County
AL
Clarke County
NA
Baldwin County
AL
Escambia County
FL
Baldwin County
AL
Mobile County
NA
Baldwin County
AL
Monroe County
NA
Barbour County
AL
Dale County
NA
Barbour County
AL
Henry County
NA
I am only interested in what states neighbor a county, so I want to remove repeated data to get this (step 1):
county
state
neighbor_state
Baldwin County
AL
NA
Baldwin County
AL
FL
Barbour County
AL
NA
And then reshape the dataframe like this (step 2):
county
state
neighbor_state_1
neighbor_state_2
neighbor_state_3
Baldwin County
AL
FL
NA
NA
Baldwin County
AL
NA
NA
NA
In step 1 I've deleted the "neighbor_county" column; however, I've not managed to remove the duplicates in the column "neighbor_state" for each distinct county. I have tried using the unique function but I can't seem to make it work such that it only removes duplicates of each distinct county.
For your first step you could drop the neighbor_county column and then use unique():
df$neighbor_county <- NULL
unique(df)
returns
county state neighbor_state
1 Baldwin_County AL NA
2 Baldwin_County AL FL
5 Barbour_County AL NA
An alternative using dplyr:
df %>%
select(-neighbor_county) %>%
distinct()
For your second step I make a suggestion:
library(tidyr)
library(dplyr)
df %>%
group_by(county) %>%
select(-neighbor_county) %>%
mutate(n = row_number()) %>%
pivot_wider(names_from=n, names_prefix="neighbor_state_", values_from=neighbor_state) %>%
ungroup()
returns
# A tibble: 2 x 6
county state neighbor_state_1 neighbor_state_2 neighbor_state_3 neighbor_state_4
<chr> <chr> <chr> <chr> <chr> <chr>
1 Baldwin_County AL 'NA' 'FL' 'NA' 'NA'
2 Barbour_County AL 'NA' 'NA' NA NA
but I'm not sure, if this is what you are looking for.
For removing doubled NA-values, you could use
df %>%
group_by(county) %>%
select(-neighbor_county) %>%
distinct() %>%
mutate(n = row_number()) %>%
pivot_wider(names_from=n, names_prefix="neighbor_state_", values_from=neighbor_state) %>%
ungroup()
Data
structure(list(county = c("Baldwin_County", "Baldwin_County",
"Baldwin_County", "Baldwin_County", "Barbour_County", "Barbour_County"
), state = c("AL", "AL", "AL", "AL", "AL", "AL"), neighbor_county = c("Clarke_County",
"Escambia_County", "Mobile_County", "Monroe_County", "Dale_County",
"Henry_County"), neighbor_state = c("'NA'", "'FL'", "'NA'", "'NA'",
"'NA'", "'NA'")), problems = structure(list(row = 6L, col = "neighbor_state",
expected = "", actual = "embedded null", file = "literal data"), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame")), class = "data.frame", row.names = c(NA,
-6L), spec = structure(list(cols = list(county = structure(list(), class = c("collector_character",
"collector")), state = structure(list(), class = c("collector_character",
"collector")), neighbor_county = structure(list(), class = c("collector_character",
"collector")), neighbor_state = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))

Gather data in r with multiple columns

I have some data that I am trying to reshape with tidyr's pivot_longer() function in R to get the output shown below, but I have not been able to get it to work.
I have data in this format. ( with many other column names )
Country State Year 1 Population 1 Year 2 Population2
U.S.A IL 2009 20000 2010 30000
U.S.A VA 2009 30000 2010 40000
I want to get data in this format.
Country State Year Population
U.S.A IL 2009 20000
U.S.A IL 2010 30000
U.S.A VA 2009 30000
U.S.A VA 2010 40000
I am able to do it for only one column, and I am not able to include the other columns, such as Population.
My code is below.
file1<-file %>%
pivot_longer(
cols = contains("Year"),
names_sep = "_",
names_to = c(".value", "repeat"),
)
I was able to make it work on Tidyverse.
library(tidyverse)
# read_excel() comes from readxl, which is installed with the tidyverse but is
# not attached by library(tidyverse), so it has to be loaded explicitly.
library(readxl)
file <- read_excel("peps300.xlsx")
# Turn "Year 1" into "Year_1" (and likewise for the other measures) so that
# every column name has the form <measure>_<index>.
names(file) <- str_replace_all(
  names(file),
  c("Year " = "Year_", "Num " = "Num_", "DRate " = "DRate_", "PRate " = "PRate_", "Denom " = "Denom_")
)
file <- file %>%
  pivot_longer(
    cols = c(contains("Year"), contains("Num"), contains("DRate"), contains("PRate"), contains("Denom")),
    # Split each name at "_": the first part becomes an output column
    # (".value"), the second part is stored in "repeat".
    names_sep = "_",
    names_to = c(".value", "repeat")
  )
An option would be to specify the cols that starts_with "Population" or "Year"
library(dplyr)
# pivot_longer() is exported by tidyr, not dplyr.
library(tidyr)
df1 %>%
  pivot_longer(
    cols = c(starts_with("Population"), starts_with("Year")),
    # "(.*)_(.*)" splits e.g. "Year_1" into the output column name (".value")
    # and the group index.
    names_to = c(".value", "group"),
    names_pattern = "(.*)_(.*)"
  )
# A tibble: 4 x 5
# Country State group Population Year
# <chr> <chr> <chr> <int> <int>
#1 U.S.A IL 1 20000 2009
#2 U.S.A IL 2 30000 2010
#3 U.S.A VA 1 30000 2009
#4 U.S.A VA 2 40000 2010
data
df1 <- structure(list(Country = c("U.S.A", "U.S.A"), State = c("IL",
"VA"), Year_1 = c(2009L, 2009L), Population_1 = c(20000L, 30000L
), Year_2 = c(2010L, 2010L), Population_2 = c(30000L, 40000L)),
class = "data.frame", row.names = c(NA,
-2L))
df %>%
  pivot_longer(
    # Reshape every column except the two identifiers.
    cols = !c(Country, State),
    # The text before the underscore names the output column; the text after it
    # is stored in "group".
    names_pattern = "(.+)_(.+)",
    names_to = c(".value", "group")
  )
# A tibble: 4 x 5
Country State group Year Population
<chr> <chr> <chr> <chr> <chr>
1 U.S.A IL 1 2009 20000
2 U.S.A IL 2 2010 30000
3 U.S.A VA 1 2009 30000
4 U.S.A VA 2 2010 40000
You can then drop the group if you don't need it.
To do this, you will need to clean your column names first. Make sure they all follow the same pattern, with the parts joined by one consistent separator; the pattern used above expects a single underscore, as in Year_1.
df <- structure(list(Country = c("U.S.A", "U.S.A"), State = c("IL",
"VA"), Year_1 = c("2009", "2009"), Population_1 = c("20000",
"30000"), Year_2 = c("2010", "2010"), Population_2 = c("30000",
"40000")), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L))

Resources