Issue with pivot_longer command - r

I am trying to convert a wide dataset to a long dataset using pivot_longer. The column headers are "Program ID" and ‘Participant_Count_22’, ‘Participant_Count_21’, ‘Participant_Count_20’, ‘Participant_Count_19’, covering the four years 2019-2022.
program_tot %>% pivot_longer(cols=c(‘Participant_Count_22’, ‘Participant_Count_21’, ‘Participant_Count_20’, ‘Participant_Count_19’),
names_to='year',
values_to='Participant_Count')
I am getting an unexpected input error message. Any tips?

You seem to have fancy single quotes around your column names. Note carefully the difference between quotes in ‘Participant_Count_22’, which R doesn't recognise as a string, and 'Participant_Count_22', which it does recognise as a string.
In any case, you will save yourself some time and complexity if you use the selection helpers to select columns. In your case you could use contains("Participant_Count")
library(tidyverse)
program_tot %>%
pivot_longer(cols = contains("Participant_Count"),
names_to = 'year',
values_to = 'Participant_Count') %>%
mutate(year = as.numeric(paste0("20", sub("Participant_Count_", "", year))))
#> # A tibble: 20 x 3
#> Program.ID year Participant_Count
#> <fct> <dbl> <dbl>
#> 1 1 2022 5
#> 2 1 2021 5
#> 3 1 2020 6
#> 4 1 2019 2
#> 5 2 2022 7
#> 6 2 2021 2
#> 7 2 2020 8
#> 8 2 2019 5
#> 9 3 2022 4
#> 10 3 2021 8
#> 11 3 2020 1
#> 12 3 2019 7
#> 13 4 2022 3
#> 14 4 2021 1
#> 15 4 2020 3
#> 16 4 2019 3
#> 17 5 2022 5
#> 18 5 2021 3
#> 19 5 2020 2
#> 20 5 2019 1
Data used
program_tot <- data.frame('Program ID' = factor(1:5),
Participant_Count_22 = c(5, 7, 4, 3, 5),
Participant_Count_21 = c(5, 2, 8, 1, 3),
Participant_Count_20 = c(6, 8, 1, 3, 2),
Participant_Count_19 = c(2, 5, 7, 3, 1))
Created on 2022-09-28 with reprex v2.0.2

Related

creating a dummy variable with consecutive cases

I have a similar problem like this one:
How can I create a dummy variable over consecutive values by group id?
The difference is: as soon as Dummy = 1, I want the dummy to be 1 for the rest of the group (ID), since year is in descending order. So, for example, out of df1:
df1 <-data.frame(ID = rep(seq(1:3), each = 4),
year = rep(c(2014, 2015, 2016, 2017),3),
value = runif(12, min = 0, max = 25),
Dummy = c(0,0,1,0 ,0,1,0,1, 1,0,0,0))
shall be :
df2 <- data.frame(ID = rep(seq(1:4), 3),
year = rep(c(2014, 2015, 2016, 2017),3),
value = runif(12, min = 0, max = 25),
Dummy = c(0,0,1,1 ,0,1,1, 1, 1,1,1,1))
I've tried something like that (and some others) but that failed:
df2<- df1%>% group_by(ID) %>% arrange(ID , year) %>%
mutate(treated = case_when(Dummy == 1 ~ 1,
lag(Dummy, n= unique(n()), default = 0) == 1 ~ 1))
If your input data is as below then we can just use cummax():
library(dplyr)
df1 <-data.frame(ID = rep(seq(1:3), each = 4),
year = rep(c(2014, 2015, 2016, 2017),3),
value = runif(12, min = 0, max = 25),
Dummy = c(0,0,1,0 ,0,1,0,1, 1,0,0,0))
df1
#> ID year value Dummy
#> 1 1 2014 14.144996 0
#> 2 1 2015 20.621603 0
#> 3 1 2016 8.325170 1
#> 4 1 2017 21.725028 0
#> 5 2 2014 11.894383 0
#> 6 2 2015 13.445744 1
#> 7 2 2016 3.332338 0
#> 8 2 2017 2.984941 1
#> 9 3 2014 17.551266 1
#> 10 3 2015 5.250556 0
#> 11 3 2016 11.062577 0
#> 12 3 2017 20.169439 0
df1 %>%
group_by(ID) %>%
mutate(Dummy = cummax(Dummy))
#> # A tibble: 12 x 4
#> # Groups: ID [3]
#> ID year value Dummy
#> <int> <dbl> <dbl> <dbl>
#> 1 1 2014 14.1 0
#> 2 1 2015 20.6 0
#> 3 1 2016 8.33 1
#> 4 1 2017 21.7 1
#> 5 2 2014 11.9 0
#> 6 2 2015 13.4 1
#> 7 2 2016 3.33 1
#> 8 2 2017 2.98 1
#> 9 3 2014 17.6 1
#> 10 3 2015 5.25 1
#> 11 3 2016 11.1 1
#> 12 3 2017 20.2 1
Created on 2022-10-14 by the reprex package (v2.0.1)

How to use pivot_longer() in R to separate columns into multiple rows by category?

Here is some fictional data:
tibble(fruit = rep(c("apple", "pear", "orange"), each = 3),
size = rep(c("big", "medium", "small"), times = 3),
# summer stock
shopA_summer_wk1 = abs(round(rnorm(9, 10, 5), 0)),
shopA_summer_wk2 = abs(round(rnorm(9, 10, 5), 0)),
shopB_summer_wk1 = abs(round(rnorm(9, 10, 5), 0)),
shopB_summer_wk2 = abs(round(rnorm(9, 10, 5), 0)),
shopC_summer_wk1 = abs(round(rnorm(9, 10, 5), 0)),
shopC_summer_wk2 = abs(round(rnorm(9, 10, 5), 0)),
# winter stock
shopA_winter_wk1 = abs(round(rnorm(9, 8, 4), 0)),
shopA_winter_wk2 = abs(round(rnorm(9, 8, 4), 0)),
shopA_winter_wk3 = abs(round(rnorm(9, 8, 4), 0)),
shopB_winter_wk1 = abs(round(rnorm(9, 8, 4), 0)),
shopB_winter_wk2 = abs(round(rnorm(9, 8, 4), 0)),
shopB_winter_wk3 = abs(round(rnorm(9, 8, 4), 0)),
shopC_winter_wk1 = abs(round(rnorm(9, 8, 4), 0)),
shopC_winter_wk2 = abs(round(rnorm(9, 8, 4), 0)),
shopC_winter_wk3 = abs(round(rnorm(9, 8, 4), 0)))
Some data is collected for 3 shops (A, B, C) across 2 weeks in the summer and 3 weeks in the winter. The data collected is the number of fruits (apple, pear, orange) per size (big, medium, small) the shop had in stock on that particular week.
Here are the first 6 rows of the dataset:
# fruit size shopA_summer_wk1 shopA_summer_wk2 shopB_summer_wk1 shopB_summer_wk2 shopC_summer_wk1 shopC_summer_wk2 shopA_winter_wk1 shopA_winter_wk2 shopA_winter_wk3
# <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 apple big 9 12 12 16 15 5 14 4 0
# 2 apple medium 21 16 16 1 12 11 8 8 9
# 3 apple small 10 6 18 18 22 12 4 2 0
# 4 pear big 13 7 4 12 13 6 10 6 2
# 5 pear medium 13 12 8 0 8 5 11 7 3
# 6 pear small 16 18 4 3 13 8 7 5 0
I would like to use the pivot_longer() function in R to restructure this dataset. Given that there are quite a few group categories I'm having difficulty in writing the code for this.
I would like it to look something like the following:
I would greatly appreciate any input :)
Using the names_pattern argument, we can do:
pivot_longer(df, c(-fruit, -size), names_pattern = '(^.*)_wk(.*$)',
names_to = c('Shop_season', 'week'))
#> # A tibble: 135 x 5
#> fruit size Shop_season week value
#> <chr> <chr> <chr> <chr> <dbl>
#> 1 apple big shopA_summer 1 11
#> 2 apple big shopA_summer 2 8
#> 3 apple big shopB_summer 1 4
#> 4 apple big shopB_summer 2 24
#> 5 apple big shopC_summer 1 9
#> 6 apple big shopC_summer 2 10
#> 7 apple big shopA_winter 1 9
#> 8 apple big shopA_winter 2 12
#> 9 apple big shopA_winter 3 5
#> 10 apple big shopB_winter 1 5
#> # ... with 125 more rows
You might also want to separate shop and season, since these are really two different variables:
pivot_longer(df, c(-fruit, -size), names_pattern = '(^.*)_wk(.*$)',
names_to = c('Shop_season', 'week')) %>%
separate(Shop_season, into = c('shop', 'season'))
#> # A tibble: 135 x 6
#> fruit size shop season week value
#> <chr> <chr> <chr> <chr> <chr> <dbl>
#> 1 apple big shopA summer 1 11
#> 2 apple big shopA summer 2 8
#> 3 apple big shopB summer 1 4
#> 4 apple big shopB summer 2 24
#> 5 apple big shopC summer 1 9
#> 6 apple big shopC summer 2 10
#> 7 apple big shopA winter 1 9
#> 8 apple big shopA winter 2 12
#> 9 apple big shopA winter 3 5
#> 10 apple big shopB winter 1 5
#> #... with 125 more rows
If data is dt, then
pivot_longer(
data = dt,
cols = -c(fruit:size),
names_to = c("shop_season", "week"),
names_pattern = "(.*)_(.*)"
)
Output:
# A tibble: 135 x 5
fruit size shop_season week value
<chr> <chr> <chr> <chr> <dbl>
1 apple big shopA_summer wk1 13
2 apple big shopA_summer wk2 12
3 apple big shopB_summer wk1 9
4 apple big shopB_summer wk2 9
5 apple big shopC_summer wk1 7
6 apple big shopC_summer wk2 17
7 apple big shopA_winter wk1 10
8 apple big shopA_winter wk2 17
9 apple big shopA_winter wk3 12
10 apple big shopB_winter wk1 8

How to add a new column with values specific to grouped variables

I'm new to R and have found similar solutions to my problem, but I'm struggling to apply these to my code. Please help...
These data are simplified, as the id variables are many:
df = data.frame(id = rep(c("a_10", "a_11", "b_10", "b_11"), each = 5),
site = rep(1:5, 4),
value = sample(1:20))
The aim is to add another column labelled "year" with values that are grouped by "id" but the true names are many - so I'm trying to simplify the code by using the ending digits.
I can use dplyr to split the dataframe into each id variable using this code (repeated for each id variable):
df %>%
select(site, id, value) %>%
filter(grepl("10$", id)) %>%
mutate(Year = "2010")`
Rather than using merge to re-combine the dataframes back into one, is there not a more simple method?
I tried modifying case_when with mutate as described in a previous answer:
https://stackoverflow.com/a/63043920/12313457
mutate(year = case_when(grepl(c("10$", "11$", id) == c("2010", "2011"))))
Is something like this possible?
Thanks in advance
In case your id column has different string lengths you can use sub:
df %>%
mutate(Year = paste0("20", sub('^.*_(\\d+)$', '\\1', id)))
#> id site value Year
#> 1 a_10 1 2 2010
#> 2 a_10 2 7 2010
#> 3 a_10 3 16 2010
#> 4 a_10 4 10 2010
#> 5 a_10 5 11 2010
#> 6 a_11 1 5 2011
#> 7 a_11 2 13 2011
#> 8 a_11 3 14 2011
#> 9 a_11 4 6 2011
#> 10 a_11 5 12 2011
#> 11 b_10 1 17 2010
#> 12 b_10 2 1 2010
#> 13 b_10 3 4 2010
#> 14 b_10 4 15 2010
#> 15 b_10 5 9 2010
#> 16 b_11 1 8 2011
#> 17 b_11 2 20 2011
#> 18 b_11 3 19 2011
#> 19 b_11 4 18 2011
#> 20 b_11 5 3 2011
Created on 2022-04-21 by the reprex package (v2.0.1)
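You can use substr to get the final two digits of id (characters 3 and 4 here, which works because every id in the example is four characters long) and then use paste0 to prepend "20" and recreate the year.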
df |> dplyr::mutate(Year = paste0("20", substr(id, 3, 4)))
#> id site value Year
#> 1 a_10 1 5 2010
#> 2 a_10 2 12 2010
#> 3 a_10 3 9 2010
#> 4 a_10 4 7 2010
#> 5 a_10 5 13 2010
#> 6 a_11 1 3 2011
#> 7 a_11 2 4 2011
#> 8 a_11 3 16 2011
#> 9 a_11 4 2 2011
#> 10 a_11 5 6 2011
#> 11 b_10 1 19 2010
#> 12 b_10 2 14 2010
#> 13 b_10 3 15 2010
#> 14 b_10 4 10 2010
#> 15 b_10 5 11 2010
#> 16 b_11 1 18 2011
#> 17 b_11 2 1 2011
#> 18 b_11 3 20 2011
#> 19 b_11 4 17 2011
#> 20 b_11 5 8 2011
Created on 2022-04-21 by the reprex package (v2.0.1)

Sum up with the next line into a new column

I'm having some trouble figuring out how to create a new column with the sum of 2 subsequent cells.
I have :
df1<- tibble(Years=c(1990, 2000, 2010, 2020, 2030, 2050, 2060, 2070, 2080),
Values=c(1,2,3,4,5,6,7,8,9 ))
Now, I want a new column where the first line is the sum of 1+2, the second line is the sum of 1+2+3 , the third line is the sum 1+2+3+4 and so on.
As 1, 2, 3, 4... are hypothetical values, I need to measure the absolute growth from one decade to the next in order to later create a new variable that measures the percentage change from one decade to the next.
library(tibble)
df1<- tibble(Years=c(1990, 2000, 2010, 2020, 2030, 2050, 2060, 2070, 2080),
Values=c(1,2,3,4,5,6,7,8,9 ))
library(slider)
library(dplyr, warn.conflicts = F)
df1 %>%
mutate(xx = slide_sum(Values, after = 1, before = Inf))
#> # A tibble: 9 x 3
#> Years Values xx
#> <dbl> <dbl> <dbl>
#> 1 1990 1 3
#> 2 2000 2 6
#> 3 2010 3 10
#> 4 2020 4 15
#> 5 2030 5 21
#> 6 2050 6 28
#> 7 2060 7 36
#> 8 2070 8 45
#> 9 2080 9 45
Created on 2021-08-12 by the reprex package (v2.0.0)
This assumes the last row is to be repeated; otherwise, the fill step can be skipped.
library(dplyr)
library(tidyr)
df1 %>%
mutate(x = lead(cumsum(Values))) %>%
fill(x)
# Years Values x
# <dbl> <dbl> <dbl>
# 1 1990 1 3
# 2 2000 2 6
# 3 2010 3 10
# 4 2020 4 15
# 5 2030 5 21
# 6 2050 6 28
# 7 2060 7 36
# 8 2070 8 45
# 9 2080 9 45
Using base R
v1 <- cumsum(df1$Values)[-1]
df1$new <- c(v1, v1[length(v1)])
You want the cumsum() function. Here are two ways to do it.
### Base R
df1$cumsum <- cumsum(df1$Values)
### Using dplyr
library(dplyr)
df1 <- df1 %>%
mutate(cumsum = cumsum(Values))
Here is the output in either case.
df1
# A tibble: 9 x 3
Years Values cumsum
<dbl> <dbl> <dbl>
1 1990 1 1
2 2000 2 3
3 2010 3 6
4 2020 4 10
5 2030 5 15
6 2050 6 21
7 2060 7 28
8 2070 8 36
9 2080 9 45
A data.table option
> setDT(df)[, newCol := shift(cumsum(Values), -1, fill = sum(Values))][]
Years Values newCol
1: 1990 1 3
2: 2000 2 6
3: 2010 3 10
4: 2020 4 15
5: 2030 5 21
6: 2050 6 28
7: 2060 7 36
8: 2070 8 45
9: 2080 9 45
or a base R option following a similar idea
transform(
df,
newCol = c(cumsum(Values)[-1],sum(Values))
)

Performing a dplyr full_join without a common variable to blend data frames

Using the dplyr full_join() operation, I am trying to perform the equivalent of a basic merge() operation in which no common variables exist (unable to satisfy the "by=" argument). This will blend two data frames and return all possible combinations.
However, the current full_join() function requires a common variable. I am unable to locate another dplyr function that can help with this. How can I perform this operation using functions specific to the dplyr library?
df_a = data.frame(department=c(1,2,3,4))
df_b = data.frame(period=c(2014,2015,2016,2017))
#This works as desired
big_df = merge(df_a,df_b)
#I'd like to perform the following in a much bigger operation:
big_df = dplyr::full_join(df_a,df_b)
#Error: No common variables. Please specify `by` param.
You can use crossing from tidyr:
crossing(df_a,df_b)
department period
1 1 2014
2 1 2015
3 1 2016
4 1 2017
5 2 2014
6 2 2015
7 2 2016
8 2 2017
9 3 2014
10 3 2015
11 3 2016
12 3 2017
13 4 2014
14 4 2015
15 4 2016
16 4 2017
If there are duplicate rows, crossing doesn't give the same result as merge.
Instead use full_join with by = character() to perform a cross-join which generates all combinations of df_a and df_b.
library("tidyverse") # version 1.3.2
# Add duplicate rows for illustration.
df_a <- tibble(department = c(1, 2, 3, 3))
df_b <- tibble(period = c(2014, 2015, 2016, 2017))
merge doesn't de-duplicate.
df_a_merge_b <- merge(df_a, df_b)
df_a_merge_b
#> department period
#> 1 1 2014
#> 2 2 2014
#> 3 3 2014
#> 4 3 2014
#> 5 1 2015
#> 6 2 2015
#> 7 3 2015
#> 8 3 2015
#> 9 1 2016
#> 10 2 2016
#> 11 3 2016
#> 12 3 2016
#> 13 1 2017
#> 14 2 2017
#> 15 3 2017
#> 16 3 2017
crossing drops duplicate rows.
df_a_crossing_b <- crossing(df_a, df_b)
df_a_crossing_b
#> # A tibble: 12 × 2
#> department period
#> <dbl> <dbl>
#> 1 1 2014
#> 2 1 2015
#> 3 1 2016
#> 4 1 2017
#> 5 2 2014
#> 6 2 2015
#> 7 2 2016
#> 8 2 2017
#> 9 3 2014
#> 10 3 2015
#> 11 3 2016
#> 12 3 2017
full_join doesn't remove duplicates either.
df_a_full_join_b <- full_join(df_a, df_b, by = character())
df_a_full_join_b
#> # A tibble: 16 × 2
#> department period
#> <dbl> <dbl>
#> 1 1 2014
#> 2 1 2015
#> 3 1 2016
#> 4 1 2017
#> 5 2 2014
#> 6 2 2015
#> 7 2 2016
#> 8 2 2017
#> 9 3 2014
#> 10 3 2015
#> 11 3 2016
#> 12 3 2017
#> 13 3 2014
#> 14 3 2015
#> 15 3 2016
#> 16 3 2017
packageVersion("tidyverse")
#> [1] '1.3.2'
Created on 2023-01-13 with reprex v2.0.2

Resources