Pivot wider to one row in R - r

Here is the sample code that I am using
library(dplyr)
naics <- c("000000","000000",123000,123000)
year <- c(2020,2021,2020,2021)
January <- c(250,251,6,9)
February <- c(252,253,7,16)
March <- c(254,255,8,20)
sample2 <- data.frame (naics, year, January, February, March)
Here is the intended result
Jan2020 Feb2020 March2020 Jan2021 Feb2021 March2021
000000 250 252 254 251 253 255
123000 6 7 8 9 16 20
Is this something that is done with pivot_wider or is it more complex?

We use pivot_wider by selecting the values_from with the month column, names_from as 'year' and then change the column name format in names_glue and if needed convert the 'naics' to row names with column_to_rownames (from tibble)
library(tidyr)
library(tibble)
pivot_wider(sample2, names_from = year, values_from = January:March,
names_glue = "{substr(.value, 1, 3)}{year}")%>%
column_to_rownames('naics')
-output
Jan2020 Jan2021 Feb2020 Feb2021 Mar2020 Mar2021
000000 250 251 252 253 254 255
123000 6 9 7 16 8 20

With reshape function from BaseR,
reshape(sample2, dir = "wide", sep="",
idvar = "naics",
timevar = "year",
new.row.names = unique(naics))[,-1]
# January2020 February2020 March2020 January2021 February2021 March2021
# 000000 250 252 254 251 253 255
# 123000 6 7 8 9 16 20

This takes a longer route than #akrun's answer. I will leave this here in case it may help with more intuition on the steps being taken. Otherwise, #akrun's answer is more resource efficient.
sample2 %>%
tidyr::pivot_longer(-c(naics, year), names_to = "month",
values_to = "value") %>%
mutate(Month=paste0(month, year)) %>%
select(-year, - month) %>%
tidyr::pivot_wider(names_from = Month,values_from = value)
# A tibble: 2 x 7
naics January2020 February2020 March2020 January2021 February2021
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 000000 250 252 254 251 253
2 123000 6 7 8 9 16
# ... with 1 more variable: March2021 <dbl>

Related

Group and add variable of type stock and another type in a single step?

I want to group by district summing 'incoming' values at quarter and get the value of the 'stock' in the last quarter (3) in just one step. 'stock' can not summed through quarters.
My example dataframe:
library(dplyr)
df <- data.frame ("district"= rep(c("ARA", "BJI", "CMC"), each=3),
"quarter"=rep(1:3,3),
"incoming"= c(4044, 2992, 2556, 1639, 9547, 1191,2038,1942,225),
"stock"= c(19547,3160, 1533,5355,6146,355,5816,1119,333)
)
df
district quarter incoming stock
1 ARA 1 4044 19547
2 ARA 2 2992 3160
3 ARA 3 2556 1533
4 BJI 1 1639 5355
5 BJI 2 9547 6146
6 BJI 3 1191 355
7 CMC 1 2038 5816
8 CMC 2 1942 1119
9 CMC 3 225 333
The actual dataframe has ~45.000 rows and 41 variables of which 8 are of type stock.
The result should be:
# A tibble: 3 × 3
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205
I know how to get to the result but in three steps and I don't think it's efficient and error prone due to the data.
My approach:
basea <- df %>%
group_by(district) %>%
filter(quarter==3) %>% #take only the last quarter
summarise(across(stock, sum)) %>%
baseb <- df %>%
group_by(district) %>%
summarise(across(incoming, sum)) %>%
final <- full_join(basea, baseb)
Does anyone have any suggestions to perform the procedure in one (or at least two) steps?
Grateful,
Modus
Given that the dataset only has 3 quarters and not 4. If that's not the case use nth(3) instead of last()
library(tidyverse)
df %>%
group_by(district) %>%
summarise(stock = last(stock),
incoming = sum(incoming))
# A tibble: 3 × 3
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205
here is a data.table approach
library(data.table)
setDT(df)[, .(incoming = sum(incoming), stock = stock[.N]), by = .(district)]
district incoming stock
1: ARA 9592 1533
2: BJI 12377 355
3: CMC 4205 333
Here's a refactor that removes some of the duplicated code. This also seems like a prime use-case for creating a custom function that can be QC'd and maintained easier:
library(dplyr)
df <- data.frame ("district"= rep(c("ARA", "BJI", "CMC"), each=3),
"quarter"=rep(1:3,3),
"incoming"= c(4044, 2992, 2556, 1639, 9547, 1191,2038,1942,225),
"stock"= c(19547,3160, 1533,5355,6146,355,5816,1119,333)
)
aggregate_stocks <- function(df, n_quarter) {
base <- df %>%
group_by(district)
basea <- base %>%
filter(quarter == n_quarter) %>%
summarise(across(stock, sum))
baseb <- base %>%
summarise(across(incoming, sum))
final <- full_join(basea, baseb, by = "district")
return(final)
}
aggregate_stocks(df, 3)
#> # A tibble: 3 × 3
#> district stock incoming
#> <chr> <dbl> <dbl>
#> 1 ARA 1533 9592
#> 2 BJI 355 12377
#> 3 CMC 333 4205
Here is the same solution as #Tom Hoel but without using a function to subset, instead just use []:
library(dplyr)
df %>%
group_by(district) %>%
summarise(stock = stock[3],
incoming = sum(incoming))
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205

Manipulate/rearrange intervals in R columns

I have a data frame in R with a column (sni) with numbers that looks like this etc
bransch
sni
name
15
name
15
name
16-18
somename
16-18
name
241-3
someothername
241-3
where I have to transform/create a new column with just one number per row, i.e. no intervals so for example it should be a new row for all individual values in the intervals and look like this
bransch
sni
name
15
name
15
name
16
name
17
name
18
somename
16
somename
17
somename
18
name
241
name
242
name
243
someothername
241
someothername
242
someothername
243
I'm a bit unsure which function can do this the best way, or if someone has stumble upon a similar problem/solution. Currently I have tried to split the sni column (where the "-" starts) into two new ones, but then I'm a bit stuck since I will have many rows in one of the new columns without any values etc. Also the column is a character at the moment.
Any advice?
Sincerely,
TS
I took a while. Here is tidyverse approach:
library(dplyr)
library(tidyr)
df %>%
separate(sni, c("x", "y")) %>%
as_tibble() %>%
mutate(y = ifelse(as.numeric(y)<=9, paste0(substr(x, 1, nchar(x)-1), y),
y)) %>%
mutate(id = row_number()) %>%
pivot_longer(c(x,y)) %>%
mutate(value = as.numeric(value)) %>%
group_by(col2 =as.integer(gl(n(),2,n()))) %>%
fill(value, .direction = "down") %>%
complete(value = seq(first(value), last(value), by=1)) %>%
fill(bransch, .direction = "down") %>%
select(bransch, sni=value) %>%
group_by(col2, sni) %>%
slice(1)
col2 bransch sni
<int> <chr> <dbl>
1 1 name 15
2 2 name 15
3 3 name 16
4 3 name 17
5 3 name 18
6 4 somename 16
7 4 somename 17
8 4 somename 18
9 5 name 241
10 5 name 242
11 5 name 243
12 6 someothername 241
13 6 someothername 242
14 6 someothername 243
Let's try this.
Assume only three digits interval would have the pattern of 123-5 instead of 123-125, therefore in the ifelse, we modify this special pattern (e.g. 123-5) of interval into more regular one (123-125). Then separate the interval to individual integer using separate_rows.
We can then use complete to fill in the missing sequence in the interval.
library(tidyverse)
df %>%
group_by(sni,bransch) %>%
mutate(sni2 = ifelse(grepl("-", sni) & nchar(sub("-.*$", "", sni)) >= 3,
sub("^(\\d\\d)(.)-", "\\1\\2-\\1", sni),
sni)) %>%
separate_rows(sni2, convert = T) %>%
complete(sni2 = min(sni2):max(sni2)) %>%
ungroup() %>%
select(-sni)
# A tibble: 14 × 2
bransch sni2
<chr> <int>
1 name 15
2 name 15
3 name 16
4 name 17
5 name 18
6 somename 16
7 somename 17
8 somename 18
9 name 241
10 name 242
11 name 243
12 someothername 241
13 someothername 242
14 someothername 243
If I understood correctly
tmp=setNames(strsplit(df$sni,"-"),df$bransch)
tmp=unlist(
lapply(tmp,function(x){
x=as.numeric(x)
if (length(x)>1) {
if (x[1]<x[2]) {
seq(x[1],x[2],1)
} else {
seq(x[1],x[1]+x[2]-1,1)
}
} else {
x
}
})
)
data.frame(
"bransch"=names(tmp),
"sni"=tmp
)
bransch sni
1 name 15
2 name 15
3 name1 16
4 name2 17
5 name3 18
6 somename1 16
7 somename2 17
8 somename3 18
9 name1 241
10 name2 242
11 name3 243
12 someothername1 241
13 someothername2 242
14 someothername3 243
Using separate to get the start and end of the sequence, the we can map and unnest to get the result.
library (tidyverse)
data %>%
separate(
sni,
into = c("from", "to"),
fill = "right",
convert = TRUE) %>%
mutate(to = if_else(is.na(to), from, to)) %>%
transmute(
bransch,
sni = map2(from, to, `:`)) %>%
unnest_longer(sni)
# A tibble: 14 x 2
bransch sni
<chr> <int>
1 name 15
2 name 15
3 name 16
4 name 17
5 name 18
6 some name 16
7 some name 17
8 some name 18
9 name 241
10 name 242
11 name 243
12 someothername 241
13 someothername 242
14 someothername 243
Data
data <- tibble(
bransch = c("name","name","name","some name","name","someothername"),
sni =c("15","15","16-18","16-18","241-243","241-243"))

How to find duplicate dates within a row in R, and then replace associated values with the mean?

There are some similar questions, however I haven't been able to find the solution for my data:
ID <- c(27,46,72)
Gest1 <- c(27,28,29)
Sys1 <- c(120,123,124)
Dia1 <- c(90,89,92)
Gest2 <- c(29,28,30)
Sys2 <- c(122,130,114)
Dia2 <- c(89,78,80)
Gest3 <- c(32,29,30)
Sys3 <- c(123,122,124)
Dia3 <- c(90,88,89)
Gest4 <- c(33,30,32)
Sys4 <- c(124,123,128)
Dia4 <- c(94,89,80)
df.1 <- data.frame(ID,Gest1,Sys1,Dia1,Gest2,Sys2,Dia2,Gest3,Sys3,
Dia3,Gest4,Sys4,Dia4)
df.1
What I need to do is identify where there are any cases of gestational age duplicates (variables beginning with Gest), and then find the mean of the associated Sys and Dia variables.
Once the mean has been calculated, I need to replace the duplicates with just 1 Gest variable, and the mean of the Sys variable and the mean of the Dia variable. Everything after those duplicates should then be moved up the dataframe.
Here is what it should look like:
df.2
My real data has 25 Gest variables with 25 associated Sys variables and 25 association Dia variables.
Sorry if this is confusing! I've tried to write an ok question but it is my first time using stack overflow.
Thank you!!
This is easier to manage in long (and tidy) format.
Using tidyverse, you can use pivot_longer to put into long form. After grouping by ID and Gest you can substitute Sys and Dia values with the mean. If there are more than one Gest for a given ID it will then use the average.
Then, you can keep that row of data with slice. After grouping by ID, you can renumber after combining those with common Gest values.
library(tidyverse)
df.1 %>%
pivot_longer(cols = -ID, names_to = c(".value", "number"), names_pattern = "(\\w+)(\\d+)") %>%
group_by(ID, Gest) %>%
mutate(across(c(Sys, Dia), mean)) %>%
slice(1) %>%
group_by(ID) %>%
mutate(number = row_number())
Output
ID number Gest Sys Dia
<dbl> <int> <dbl> <dbl> <dbl>
1 27 1 27 120 90
2 27 2 29 122 89
3 27 3 32 123 90
4 27 4 33 124 94
5 46 1 28 126. 83.5
6 46 2 29 122 88
7 46 3 30 123 89
8 72 1 29 124 92
9 72 2 30 119 84.5
10 72 3 32 128 80
Note - I would keep in long form - but if you wanted wide again, you can add:
pivot_wider(id_cols = ID, names_from = number, values_from = c(Gest, Sys, Dia))
This involved change the structure of the table into the long format, averaging the duplicates and then reformatting back into the desired table:
library(tidyr)
library(dplyr)
df.1 <- data.frame(ID,Gest1,Sys1,Dia1,Gest2,Sys2,Dia2,Gest3,Sys3, Dia3,Gest4,Sys4,Dia4)
#convert data to long format
longdf <- df.1 %>% pivot_longer(!ID, names_to = c(".value", "time"), names_pattern = "(\\D+)(\\d)", values_to="count")
#average duplicate rows
temp<-longdf %>% group_by(ID, Gest) %>% summarize(Sys=mean(Sys), Dia=mean(Dia)) %>% mutate(time = row_number())
#convert back to wide format
answer<-temp %>% pivot_wider(ID, names_from = time, values_from = c("Gest", "Sys", "Dia"), names_glue = "{.value}{time}")
#resort the columns
answer <-answer[ , names(df.1)]
answer
# A tibble: 3 × 13
# Groups: ID [3]
ID Gest1 Sys1 Dia1 Gest2 Sys2 Dia2 Gest3 Sys3 Dia3 Gest4 Sys4 Dia4
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 27 27 120 90 29 122 89 32 123 90 33 124 94
2 46 28 126. 83.5 29 122 88 30 123 89 NA NA NA
3 72 29 124 92 30 119 84.5 32 128 80 NA NA NA

R - sum a set number of rows from same column in a different data frame

I have the following data frame:
df <- data.frame( year = c(1985,1986,1987,1988,1989,1990,
1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,
2001,2002,2003,2004,2005,2006,2007,2008,2009, 2010,
2011,2012, 2013,2014,2015,2016,2017,2018,2019,2020),
value = c(0,5,10,2,6,7,3,4,5,9,10,6,8,7,3,5,2,10,9,6,5,10,4,7,8,10,
4,6,8,9,2,3,7,6,2,1))
I want to create a second data frame (df2) that consists of 20 years intervals from the previous data frame, i.e.
df2 <- data.frame(year=c("1985-2005", "1986-2006","1987-2007", "1988-2008","1989-2009",
"1990-2010", "1991-2011","1992-2002", "1993-2003","1994-2004",
"1995-2005", "1996-2006","1997-2007", "1998-2008", "1999-2009",
"2000-2020"))
Now the value for df2 should be the sum of value on df for 20 years intervals
(i.e., for year "1985-2005" in df2, the value is the sum of values from 1985 until 2005 in df - Excel snapshot attached with final values)
How can I perform this calculation? Also any possible automation to define the year interval in df2 without having to type it?
A possible solution:
library(tidyverse)
df <- data.frame( year = c(1985,1986,1987,1988,1989,1990,
1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,
2001,2002,2003,2004,2005,2006,2007,2008,2009, 2010,
2011,2012, 2013,2014,2015,2016,2017,2018,2019,2020),
value = c(0,5,10,2,6,7,3,4,5,9,10,6,8,7,3,5,2,10,9,6,5,10,4,7,8,10,
4,6,8,9,2,3,7,6,2,1))
df2 <- data.frame(year=c("1985-2005", "1986-2006","1987-2007", "1988-2008","1989-2009",
"1990-2010", "1991-2011","1992-2002", "1993-2003","1994-2004",
"1995-2005", "1996-2006","1997-2007", "1998-2008", "1999-2009",
"2000-2020"))
df2 %>%
separate(year, into = c("y1", "y2"), sep="-", convert = T, remove = F) %>%
rowwise %>%
mutate(value = sum(df$value[df$year >= y1 & df$year <= y2])) %>%
select(-y1, -y2) %>% ungroup
#> # A tibble: 16 × 2
#> year value
#> <chr> <dbl>
#> 1 1985-2005 122
#> 2 1986-2006 132
#> 3 1987-2007 131
#> 4 1988-2008 128
#> 5 1989-2009 134
#> 6 1990-2010 138
#> 7 1991-2011 135
#> 8 1992-2002 69
#> 9 1993-2003 74
#> 10 1994-2004 75
#> 11 1995-2005 71
#> 12 1996-2006 71
#> 13 1997-2007 69
#> 14 1998-2008 68
#> 15 1999-2009 69
#> 16 2000-2020 124

Conditional Calculation for a Column in R

I have the following data:
pop.2017 <- c(434,346,345,357)
pop.2018 <- c(334,336,325,345)
pop.2019 <- c(477,346,145,345)
pop.2020 <- c(474,366,341,300)
total <- c(34,36,34,35)
incident_month_yr <- c("2017-2","2017-5","2018-2","2019-2")
df <- data.frame(incident_month_yr,pop.2017,pop.2018,pop.2019,pop.2020,total)
df['perc'] <- NA
For rows where incident_month_yr contains 2017, I want perc to equal total/pop.2017
For rows where incident_month_yr contains 2018, I want perc to equal total/pop.2018
For rows where incident_month_yr contains 2019, I want perc to equal total/pop.2019
For rows where incident_month_yr contains 2020, I want perc to equal total/pop.2020
I've tried this:
df$perc[grepl(2017,df$incident_month_yr)] <- df$total/df$pop.2017
df$perc[grepl(2018,df$incident_month_yr)] <- df$total/df$pop.2018
df$perc[grepl(2019,df$incident_month_yr)] <- df$total/df$pop.2019
df$perc[grepl(2020,df$incident_month_yr)] <- df$total/df$pop.2020
However, it's not applying the calculations to specific rows like I want. How can I do this?
You can use the following solution:
library(dplyr)
library(stringr)
df %>%
mutate(perc = ifelse(str_detect(incident_month_yr, "2017"), total/pop.2017,
ifelse(str_detect(incident_month_yr, "2018"), total/pop.2018,
total/pop.2019)))
incident_month_yr pop.2017 pop.2018 pop.2019 pop.2020 total perc
1 2017-2 434 334 477 474 34 0.07834101
2 2017-5 346 336 346 366 36 0.10404624
3 2018-2 345 325 145 341 34 0.10461538
4 2019-2 357 345 345 300 35 0.10144928
Special Thanks to dear #akrun
We can also replace str_detect with grepl function from base R to use fewer packages and use case_when in place of ifelse as an unnested alternative.
df %>%
mutate(perc = case_when(
grepl("2017", incident_month_yr) ~ total/pop.2017,
grepl("2018", incident_month_yr) ~ total/pop.2018,
TRUE ~ total/pop.2019
))
incident_month_yr pop.2017 pop.2018 pop.2019 pop.2020 total perc
1 2017-2 434 334 477 474 34 0.07834101
2 2017-5 346 336 346 366 36 0.10404624
3 2018-2 345 325 145 341 34 0.10461538
4 2019-2 357 345 345 300 35 0.10144928
We can do this with match. Get the column names that have 'pop' substring ('nm1)', remove the characters that are not year from 'incident_month_yr', and the column name, use match to return the column index, cbind with the sequence of rows, extract the values from the 'pop' columns, divide by 'total' and assign it to 'perc' column
nm1 <- grep('pop', names(df), value = TRUE)
nm2 <- trimws(df$incident_month_yr, whitespace = '-.*')
nm3 <- trimws(nm1, whitespace = 'pop\\.')
df$perc <- df$total/df[nm1][cbind(seq_len(nrow(df)), match(nm2, nm3))]
df$perc
#[1] 0.07834101 0.10404624 0.10461538 0.10144928
In dplyr, an option is do rowwise, construct the column name from the 'incident_month_yr' with str_replace to capture the year part, append the 'pop.' as prefix, get the value and divide with 'total' column
library(stringr)
library(dplyr)
df %>%
rowwise %>%
mutate(perc = total/get(str_replace(incident_month_yr,
"(\\d{4})-\\d+", 'pop.\\1'))) %>%
ungroup
-output
# A tibble: 4 x 7
# incident_month_yr pop.2017 pop.2018 pop.2019 pop.2020 total perc
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 2017-2 434 334 477 474 34 0.0783
#2 2017-5 346 336 346 366 36 0.104
#3 2018-2 345 325 145 341 34 0.105
#4 2019-2 357 345 345 300 35 0.101
Here are two approaches, one in base R and one using tidy data. The provided data is not tidy, that's why base R looks uncomfortable:
# Define the target
target <- c(0.07834101, 0.10404624, 0.10461538, 0.10144928)
That is our goal, calculating target.
First, use base R and ifelse:
result1 <- with(df,
ifelse(grepl(2017, incident_month_yr),
total/pop.2017,
ifelse(grepl(2018, incident_month_yr),
total/pop.2018,
ifelse(grepl(2019, incident_month_yr),
total/pop.2019,
ifelse(grepl(2020, incident_month_yr),
total/pop.2020,
NA)))))
identical(round(result1, 4), round(target, 4))
#> [1] TRUE
And, the tidy way, reshaping into tidy data and calculating the result:
library(dplyr)
library(tidyr)
result2 <- df %>% pivot_longer(starts_with("pop."), names_to = "pop", names_prefix = "pop.") %>%
filter(substr(incident_month_yr, 1, 4) == pop) %>%
mutate(perc = total/value) %>%
pull(perc)
identical(round(result2, 4), round(target, 4))
#> [1] TRUE

Resources