Pivot_longer into an already existing column - r

I want to pivot multiple columns, two by two, into an already existing pair of columns, going from this:
have <- tribble(
  ~egtest, ~egorres, ~egorresu, ~hrorres, ~hrorresu, ~prorres, ~prorresu, ~uninteresting,
  "qt",  500, "msec", 90, "bpm", 100, "msec", "cat",
  "qtc", 370, "msec", NA, "bpm", 103, "msec", "dog",
  "pra",  83, "msec", 79, "bpm",  97, "msec", "cat"
)
to this:
want <- tribble(
  ~egtest, ~egorres, ~egorresu, ~uninteresting,
  "qt",  500, "msec", "cat",
  "qtc", 370, "msec", "dog",
  "pra",  83, "msec", "cat",
  "hr",   90, "bpm",  "cat",
  "pr",  100, "msec", "cat",
  "hr",   NA, "bpm",  "dog",
  "pr",  103, "msec", "dog",
  "hr",   79, "bpm",  "cat",
  "pr",   97, "msec", "cat"
)
For now my code is:
colstopivotEG <- function(table) {
  out <- subset(colnames(table), grepl(pattern = "orres\\b", colnames(table)))
  out <- out[out != "egorres"]
  # print(out)
  return(out)
}

pivot_eg <- function(ndf) {
  EG1 <- pivot_longer(ndf,
                      cols = colstopivotEG(ndf),
                      names_pattern = "(.*)orres",
                      names_to = "egtest",
                      values_to = "egorres")
  EG2 <- pivot_longer(ndf,
                      cols = ends_with("orresu"),
                      names_pattern = "(.*)orresu",
                      values_to = "egorresu")
  ndf <- bind_cols(EG1, EG2 %>% select(EGORRESU_STD))
}
But I can't seem to pivot into an existing column. I'm out of ideas, so any help would be great, thanks!
PS: there are a lot of columns that I don't want pivoted.

I would split the tibble into two by columns:
The columns starting with eg (keep them as they are)
The rest (pivot them).
Afterwards (after repairing the second tibble's names) we can bind the two tibbles together again.
library(dplyr)
library(tidyr)

eg <- have %>%
  select(starts_with("eg"))

rest <- have %>%
  select(-starts_with("eg")) %>%
  pivot_longer(everything(),
               names_pattern = "(hr|pr)(.+)",
               names_to = c("egtest", ".value")) %>%
  rename(egorres = orres,
         egorresu = orresu)

bind_rows(eg, rest)
which gives
egtest egorres egorresu
<chr> <dbl> <chr>
1 qt 500 msec
2 qtc 370 msec
3 pra 83 msec
4 hr 90 bpm
5 pr 100 msec
6 hr NA bpm
7 pr 103 msec
8 hr 79 bpm
9 pr 97 msec
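If you also need to keep the uninteresting column (so the result matches want), a minimal sketch along the same split-and-bind lines, assuming the column is spelled uninteresting as in want:
eg <- have %>%
  select(starts_with("eg"), uninteresting)

rest <- have %>%
  select(-starts_with("eg")) %>%
  pivot_longer(-uninteresting,                    # keep the extra column as an id
               names_pattern = "(hr|pr)(.+)",
               names_to = c("egtest", ".value")) %>%
  rename(egorres = orres,
         egorresu = orresu)

bind_rows(eg, rest)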

Another possible solution:
library(tidyverse)

bind_rows(have[c(1:3, 8)],
          map(list(c(4:5, 8), 6:8),
              ~ bind_cols(egtest = str_sub(names(have[.x])[1], 1, 2),
                          have[.x] %>% set_names(names(have[c(2:3, 8)])))))
#> # A tibble: 9 × 4
#> egtest egorres egorresu uninteresting
#> <chr> <dbl> <chr> <chr>
#> 1 qt 500 msec cat
#> 2 qtc 370 msec dog
#> 3 pra 83 msec cat
#> 4 hr 90 bpm cat
#> 5 hr NA bpm dog
#> 6 hr 79 bpm cat
#> 7 pr 100 msec cat
#> 8 pr 103 msec dog
#> 9 pr 97 msec cat

Related

Group and add variable of type stock and another type in a single step?

I want to group by district, summing the 'incoming' values across quarters and getting the value of 'stock' in the last quarter (3), all in a single step. 'stock' cannot be summed across quarters.
My example dataframe:
library(dplyr)
df <- data.frame(district = rep(c("ARA", "BJI", "CMC"), each = 3),
                 quarter  = rep(1:3, 3),
                 incoming = c(4044, 2992, 2556, 1639, 9547, 1191, 2038, 1942, 225),
                 stock    = c(19547, 3160, 1533, 5355, 6146, 355, 5816, 1119, 333))
df
district quarter incoming stock
1 ARA 1 4044 19547
2 ARA 2 2992 3160
3 ARA 3 2556 1533
4 BJI 1 1639 5355
5 BJI 2 9547 6146
6 BJI 3 1191 355
7 CMC 1 2038 5816
8 CMC 2 1942 1119
9 CMC 3 225 333
The actual data frame has ~45,000 rows and 41 variables, of which 8 are of type stock.
The result should be:
# A tibble: 3 × 3
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205
I know how to get the result, but it takes three steps, and I don't think it's efficient; it is also error prone given the size of the data.
My approach:
basea <- df %>%
  group_by(district) %>%
  filter(quarter == 3) %>%   # take only the last quarter
  summarise(across(stock, sum))
baseb <- df %>%
  group_by(district) %>%
  summarise(across(incoming, sum))
final <- full_join(basea, baseb)
Does anyone have any suggestions to perform the procedure in one (or at least two) steps?
Grateful,
Modus
This assumes the dataset only has 3 quarters and not 4; if that's not the case, use nth(stock, 3) instead of last(stock).
library(tidyverse)
df %>%
  group_by(district) %>%
  summarise(stock = last(stock),
            incoming = sum(incoming))
# A tibble: 3 × 3
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205
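Since the real data has several stock-type columns, the same summarise can be scaled with across(). A sketch only, assuming the stock-type columns share a common prefix such as "stock" (a hypothetical naming) and that rows are ordered by quarter within each district:
df %>%
  arrange(district, quarter) %>%                    # make sure the last row is quarter 3
  group_by(district) %>%
  summarise(across(starts_with("stock"), last),     # last-quarter value for stock-type columns
            across(starts_with("incoming"), sum))   # sum for flow-type columns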
Here is a data.table approach:
library(data.table)
setDT(df)[, .(incoming = sum(incoming), stock = stock[.N]), by = .(district)]
district incoming stock
1: ARA 9592 1533
2: BJI 12377 355
3: CMC 4205 333
Here's a refactor that removes some of the duplicated code. This also seems like a prime use case for a custom function that can be QC'd and maintained more easily:
library(dplyr)
df <- data.frame(district = rep(c("ARA", "BJI", "CMC"), each = 3),
                 quarter  = rep(1:3, 3),
                 incoming = c(4044, 2992, 2556, 1639, 9547, 1191, 2038, 1942, 225),
                 stock    = c(19547, 3160, 1533, 5355, 6146, 355, 5816, 1119, 333))
aggregate_stocks <- function(df, n_quarter) {
  base <- df %>%
    group_by(district)
  basea <- base %>%
    filter(quarter == n_quarter) %>%
    summarise(across(stock, sum))
  baseb <- base %>%
    summarise(across(incoming, sum))
  final <- full_join(basea, baseb, by = "district")
  return(final)
}
aggregate_stocks(df, 3)
#> # A tibble: 3 × 3
#> district stock incoming
#> <chr> <dbl> <dbl>
#> 1 ARA 1533 9592
#> 2 BJI 355 12377
#> 3 CMC 333 4205
Here is the same solution as Tom Hoel's, but without using a function to subset; instead, just use []:
library(dplyr)
df %>%
  group_by(district) %>%
  summarise(stock = stock[3],
            incoming = sum(incoming))
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205

Efficient way to repeat operations with columns with similar name in R

I am a beginner with R and have found myself repeatedly running into a problem of this kind. Say I have a dataframe with columns:
company, shares_2010, shares_2011, ... , shares_2020, share_price_2010, ... , share_price_2020
TeslaInc 1000 1200 2000 8 40
.
.
.
I then want to go ahead and calculate the market value in each year. Ordinarily I would do it this way:
dataframe <- dataframe %>%
  mutate(value_2010 = shares_2010 * share_price_2010,
         value_2011 = shares_2011 * share_price_2011,
         ...
         value_2020 = shares_2020 * share_price_2020)
Clearly, all of this is rather cumbersome to type out each time, and it cannot be made dynamic with respect to the number of time periods included. Is there any clever way to do these operations in one line instead? I suspect something may be possible with a combination of starts_with() and some lambda function, but I just haven't been able to figure out how to make the correct things multiply. Surely the tidyverse must have a better way to do this?
Any help is much appreciated!
You're right, this is a very common situation in data management.
Let's make a minimal, reproducible example:
dat <- data.frame(
  company = c("TeslaInc", "Merta"),
  shares_2010 = c(1000L, 1500L),
  shares_2011 = c(1200L, 1100L),
  shareprice_2010 = 8:7,
  shareprice_2011 = c(40L, 12L)
)
dat
#> company shares_2010 shares_2011 shareprice_2010 shareprice_2011
#> 1 TeslaInc 1000 1200 8 40
#> 2 Merta 1500 1100 7 12
This dataset has two issues:
It's in a wide format. This is relatively easy to visualise for humans, but it's not ideal for data analysis. We can fix this with pivot_longer() from tidyr.
Each column actually contains two variables: measure (share or share price) and year. We can fix this with separate() from the same package.
library(tidyr)
dat_reshaped <- dat |>
  pivot_longer(shares_2010:shareprice_2011) |>
  separate(name, into = c("name", "year")) |>
  pivot_wider(everything(), values_from = value, names_from = name)
dat_reshaped
#> # A tibble: 4 × 4
#> company year shares shareprice
#> <chr> <chr> <int> <int>
#> 1 TeslaInc 2010 1000 8
#> 2 TeslaInc 2011 1200 40
#> 3 Merta 2010 1500 7
#> 4 Merta 2011 1100 12
The last pivot_wider() is needed to have shares and shareprice as two separate columns, for ease of further calculations.
We can finally use mutate() to calculate in one go all the new values.
dat_reshaped |>
  dplyr::mutate(value = shares * shareprice)
#> # A tibble: 4 × 5
#> company year shares shareprice value
#> <chr> <chr> <int> <int> <int>
#> 1 TeslaInc 2010 1000 8 8000
#> 2 TeslaInc 2011 1200 40 48000
#> 3 Merta 2010 1500 7 10500
#> 4 Merta 2011 1100 12 13200
I recommend you read this chapter of R4DS to better understand these concepts - it's worth the effort!
I think further analysis will be simpler if you reshape your data long.
Here, we can extract shares, share_price, and year from the header names using pivot_longer. I specify that I want to split the headers into two pieces separated by _, and to put the name (aka .value) from the beginning of the header (that is, shares or share_price) next to the year that came from the end of the header.
Then the calculation is a simple one-liner.
library(tidyr); library(dplyr)

data.frame(company = "Tesla",
           shares_2010 = 5, shares_2011 = 6,
           share_price_2010 = 100, share_price_2011 = 110) %>%
  pivot_longer(-company,
               names_to = c(".value", "year"),
               names_pattern = "(.*)_(.*)") %>%
  mutate(value = shares * share_price)
# A tibble: 2 × 5
company year shares share_price value
<chr> <chr> <dbl> <dbl> <dbl>
1 Tesla 2010 5 100 500
2 Tesla 2011 6 110 660
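If you later need the wide layout back (value_2010, value_2011, and so on), pivot_wider can reverse the reshape. A rough sketch, where long_df stands for the tibble printed above (a hypothetical name):
long_df %>%
  pivot_wider(names_from = year,
              values_from = c(shares, share_price, value))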
I agree with the other posts about pivoting this data into a longer format. Just to add a different approach that works well with this type of example: you can create a list of expressions and then use the splice operator !!! to evaluate these expressions within your context:
library(purrr)
library(dplyr)
library(rlang)
library(glue)

lexprs <- set_names(2010:2011, paste0("value_", 2010:2011)) %>%
  map_chr(~ glue("shares_{.x} * share_price_{.x}")) %>%
  parse_exprs()

df %>%
  mutate(!!! lexprs)
Output
company shares_2010 shares_2011 share_price_2010 share_price_2011 value_2010
1 TeslaInc 1000 1200 8 40 8000
2 Merta 1500 1100 7 12 10500
value_2011
1 48000
2 13200
Data
Thanks to Andrea M
structure(list(company = c("TeslaInc", "Merta"), shares_2010 = c(1000L,
1500L), shares_2011 = c(1200L, 1100L), share_price_2010 = 8:7,
share_price_2011 = c(40L, 12L)), class = "data.frame", row.names = c(NA,
-2L))
How it works
With this usage, the splice operator takes a named list of expressions. The names of the list become the variable names and the expressions are evaluated in the context of your mutate statement.
> lexprs
$value_2010
shares_2010 * share_price_2010
$value_2011
shares_2011 * share_price_2011
To see how this injection will resolve, we can use rlang::qq_show:
> rlang::qq_show(df %>% mutate(!!! lexprs))
df %>% mutate(value_2010 = shares_2010 * share_price_2010, value_2011 = shares_2011 *
share_price_2011)
It is indeed likely that you will want your data in a long format. But in case you don't, you can do this:
# thanks Andrea M!
df <- data.frame(
  company = c("TeslaInc", "Merta"),
  shares_2010 = c(1000L, 1500L),
  shares_2011 = c(1200L, 1100L),
  share_price_2010 = 8:7,
  share_price_2011 = c(40L, 12L)
)
years <- sub('shares_', '', grep('^shares_', names(df), value = TRUE))

for (year in years) {
  df[[paste0('value_', year)]] <-
    df[[paste0('shares_', year)]] * df[[paste0('share_price_', year)]]
}
If you wanted to avoid the loop (for (...) {...}) you can use this instead:
sp <- df[, paste0('shares_', years)] * df[, paste0('share_price_', years)]
names(sp) <- paste0('value_', years)
df <- cbind(df, sp)

How to find duplicate dates within a row in R, and then replace associated values with the mean?

There are some similar questions; however, I haven't been able to find a solution for my data:
ID <- c(27,46,72)
Gest1 <- c(27,28,29)
Sys1 <- c(120,123,124)
Dia1 <- c(90,89,92)
Gest2 <- c(29,28,30)
Sys2 <- c(122,130,114)
Dia2 <- c(89,78,80)
Gest3 <- c(32,29,30)
Sys3 <- c(123,122,124)
Dia3 <- c(90,88,89)
Gest4 <- c(33,30,32)
Sys4 <- c(124,123,128)
Dia4 <- c(94,89,80)
df.1 <- data.frame(ID, Gest1, Sys1, Dia1, Gest2, Sys2, Dia2,
                   Gest3, Sys3, Dia3, Gest4, Sys4, Dia4)
df.1
What I need to do is identify where there are any cases of gestational age duplicates (variables beginning with Gest), and then find the mean of the associated Sys and Dia variables.
Once the mean has been calculated, I need to replace the duplicates with just 1 Gest variable, and the mean of the Sys variable and the mean of the Dia variable. Everything after those duplicates should then be moved up the dataframe.
Here is what it should look like:
df.2
My real data has 25 Gest variables, with 25 associated Sys variables and 25 associated Dia variables.
Sorry if this is confusing! I've tried to write an OK question, but it is my first time using Stack Overflow.
Thank you!!
This is easier to manage in long (and tidy) format.
Using the tidyverse, you can use pivot_longer to put the data into long form. After grouping by ID and Gest you can substitute the Sys and Dia values with their mean; if there is more than one Gest for a given ID, it will then use the average.
Then, you can keep one row per group with slice. After grouping by ID, you can renumber the measurements once those with common Gest values have been combined.
library(tidyverse)
df.1 %>%
  pivot_longer(cols = -ID, names_to = c(".value", "number"),
               names_pattern = "(\\w+)(\\d+)") %>%
  group_by(ID, Gest) %>%
  mutate(across(c(Sys, Dia), mean)) %>%
  slice(1) %>%
  group_by(ID) %>%
  mutate(number = row_number())
Output
ID number Gest Sys Dia
<dbl> <int> <dbl> <dbl> <dbl>
1 27 1 27 120 90
2 27 2 29 122 89
3 27 3 32 123 90
4 27 4 33 124 94
5 46 1 28 126. 83.5
6 46 2 29 122 88
7 46 3 30 123 89
8 72 1 29 124 92
9 72 2 30 119 84.5
10 72 3 32 128 80
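As a check on the averaging: ID 46 has Gest = 28 recorded twice (Sys 123 and 130, Dia 89 and 78), so the collapsed row shows Sys = (123 + 130)/2 = 126.5 (printed as 126.) and Dia = (89 + 78)/2 = 83.5.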
Note: I would keep it in long form, but if you wanted wide again, you can add:
pivot_wider(id_cols = ID, names_from = number, values_from = c(Gest, Sys, Dia))
This involves changing the structure of the table into the long format, averaging the duplicates, and then reformatting back into the desired table:
library(tidyr)
library(dplyr)
df.1 <- data.frame(ID, Gest1, Sys1, Dia1, Gest2, Sys2, Dia2,
                   Gest3, Sys3, Dia3, Gest4, Sys4, Dia4)
# convert data to long format
longdf <- df.1 %>%
  pivot_longer(!ID, names_to = c(".value", "time"),
               names_pattern = "(\\D+)(\\d)", values_to = "count")
# average duplicate rows
temp <- longdf %>%
  group_by(ID, Gest) %>%
  summarize(Sys = mean(Sys), Dia = mean(Dia)) %>%
  mutate(time = row_number())
# convert back to wide format
answer <- temp %>%
  pivot_wider(ID, names_from = time, values_from = c("Gest", "Sys", "Dia"),
              names_glue = "{.value}{time}")
# resort the columns
answer <- answer[, names(df.1)]
answer
# A tibble: 3 × 13
# Groups: ID [3]
ID Gest1 Sys1 Dia1 Gest2 Sys2 Dia2 Gest3 Sys3 Dia3 Gest4 Sys4 Dia4
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 27 27 120 90 29 122 89 32 123 90 33 124 94
2 46 28 126. 83.5 29 122 88 30 123 89 NA NA NA
3 72 29 124 92 30 119 84.5 32 128 80 NA NA NA

R - sum a set number of rows from same column in a different data frame

I have the following data frame:
df <- data.frame(year = c(1985, 1986, 1987, 1988, 1989, 1990,
                          1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
                          2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
                          2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020),
                 value = c(0, 5, 10, 2, 6, 7, 3, 4, 5, 9, 10, 6, 8, 7, 3, 5, 2, 10, 9, 6, 5, 10, 4, 7, 8, 10,
                           4, 6, 8, 9, 2, 3, 7, 6, 2, 1))
I want to create a second data frame (df2) that consists of 20-year intervals from the previous data frame, i.e.
df2 <- data.frame(year=c("1985-2005", "1986-2006","1987-2007", "1988-2008","1989-2009",
"1990-2010", "1991-2011","1992-2002", "1993-2003","1994-2004",
"1995-2005", "1996-2006","1997-2007", "1998-2008", "1999-2009",
"2000-2020"))
Now the value for df2 should be the sum of value in df over each interval
(i.e., for year "1985-2005" in df2, the value is the sum of the values from 1985 through 2005 in df; an Excel snapshot with the final values is attached).
How can I perform this calculation? Also, is there any way to automate defining the year intervals in df2 without having to type them out?
A possible solution:
library(tidyverse)
df <- data.frame(year = c(1985, 1986, 1987, 1988, 1989, 1990,
                          1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
                          2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
                          2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020),
                 value = c(0, 5, 10, 2, 6, 7, 3, 4, 5, 9, 10, 6, 8, 7, 3, 5, 2, 10, 9, 6, 5, 10, 4, 7, 8, 10,
                           4, 6, 8, 9, 2, 3, 7, 6, 2, 1))

df2 <- data.frame(year = c("1985-2005", "1986-2006", "1987-2007", "1988-2008", "1989-2009",
                           "1990-2010", "1991-2011", "1992-2002", "1993-2003", "1994-2004",
                           "1995-2005", "1996-2006", "1997-2007", "1998-2008", "1999-2009",
                           "2000-2020"))
df2 %>%
  separate(year, into = c("y1", "y2"), sep = "-", convert = TRUE, remove = FALSE) %>%
  rowwise() %>%
  mutate(value = sum(df$value[df$year >= y1 & df$year <= y2])) %>%
  select(-y1, -y2) %>%
  ungroup()
#> # A tibble: 16 × 2
#> year value
#> <chr> <dbl>
#> 1 1985-2005 122
#> 2 1986-2006 132
#> 3 1987-2007 131
#> 4 1988-2008 128
#> 5 1989-2009 134
#> 6 1990-2010 138
#> 7 1991-2011 135
#> 8 1992-2002 69
#> 9 1993-2003 74
#> 10 1994-2004 75
#> 11 1995-2005 71
#> 12 1996-2006 71
#> 13 1997-2007 69
#> 14 1998-2008 68
#> 15 1999-2009 69
#> 16 2000-2020 124
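For the second part of the question (defining the intervals without typing them out), a small sketch, assuming each interval should run from a start year to start + 20 (adjust if some intervals are meant to span fewer years, as a few in the posted df2 do):
# build the interval labels programmatically
starts <- df$year[df$year + 20 <= max(df$year)]
df2 <- data.frame(year = paste(starts, starts + 20, sep = "-"))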

Struggling to Create a Pivot Table in R

I am very, very new to any type of coding language. I am used to pivot tables in Excel, and I am trying to replicate a pivot I have done in Excel in R. I have spent a long time searching the internet/YouTube, but I just can't get it to work.
I am looking to produce a table in which the left-hand column shows a number of locations, and the top of the table shows the different pages that have been viewed. The body of the table should show the number of views per location for each of these pages.
The data frame 'specificreports' shows all views over the past year for different pages on an online platform. I want to filter for the month of October, and then pivot the different Employee Teams against the number of views for different pages.
specificreports <- readxl::read_excel("Multi-Tab File - Dashboard Usage.xlsx",
                                      sheet = "Specific Reports")
specificreportsLocal <- tbl_df(specificreports)
specificreportsLocal %>%
  filter(Month == "October") %>%
  group_by("Employee Team")
This bit works, in that it groups the different team names and filters entries for the month of October. After this I have tried using the summarise function to summarise the number of hits, but I can't get it to work at all: I keep getting errors regarding data types, and I keep getting confused because the solutions I look up use different packages.
I would appreciate any help, using the simplest way of doing this as I am a total newbie!
Thanks in advance,
Holly
Let's see if I can help a bit. It's hard to know what your data looks like from the info you gave us, so I'm going to guess and make some fake data for us to play with. It's worth noting that having field names with spaces in them is going to make your life really hard; you should start by renaming your fields to something more manageable. Since I'm just making data up, I'll give my fields names without spaces:
library(tidyverse)
## this makes some fake data
## a data frame with 3 fields: month, team, value
n <- 100
specificreportsLocal <-
  data.frame(
    month = sample(1:12, size = n, replace = TRUE),
    team = letters[1:5],
    value = sample(1:100, size = n, replace = TRUE)
  )
That's just a data frame called specificreportsLocal with three fields: month, team, value
Let's do some things with it:
# This will give us total values by team when month = 10
specificreportsLocal %>%
  filter(month == 10) %>%
  group_by(team) %>%
  summarize(total_value = sum(value))
#> # A tibble: 4 x 2
#> team total_value
#> <fct> <int>
#> 1 a 119
#> 2 b 172
#> 3 c 67
#> 4 d 229
I think that's sort of like what you already did, except I added the summarize to show how it works.
Now let's use all months and reshape it from 'long' to 'wide'
# if I want to see all months I leave out the filter and
# add a group_by month
specificreportsLocal %>%
  group_by(team, month) %>%
  summarize(total_value = sum(value)) %>%
  head(5) # this just shows the first 5 values
#> # A tibble: 5 x 3
#> # Groups: team [1]
#> team month total_value
#> <fct> <int> <int>
#> 1 a 1 17
#> 2 a 2 46
#> 3 a 3 91
#> 4 a 4 69
#> 5 a 5 83
# to make this 'long' data 'wide', we can use the `spread` function
specificreportsLocal %>%
  group_by(team, month) %>%
  summarize(total_value = sum(value)) %>%
  spread(team, total_value)
#> # A tibble: 12 x 6
#> month a b c d e
#> <int> <int> <int> <int> <int> <int>
#> 1 1 17 122 136 NA 167
#> 2 2 46 104 158 94 197
#> 3 3 91 NA NA NA 11
#> 4 4 69 120 159 76 98
#> 5 5 83 186 158 19 208
#> 6 6 103 NA 118 105 84
#> 7 7 NA NA 73 127 107
#> 8 8 NA 130 NA 166 99
#> 9 9 125 72 118 135 71
#> 10 10 119 172 67 229 NA
#> 11 11 107 81 NA 131 49
#> 12 12 174 87 39 NA 41
Created on 2018-12-01 by the reprex package (v0.2.1)
Now I'm not really sure if that's what you want. So feel free to make a comment on this answer if you need any of this clarified.
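As a side note, spread() has since been superseded in tidyr by pivot_wider(), so the same long-to-wide step could also be written as:
specificreportsLocal %>%
  group_by(team, month) %>%
  summarize(total_value = sum(value)) %>%
  pivot_wider(names_from = team, values_from = total_value)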
Welcome to Stack Overflow!
I'm not sure I correctly understand your need without a data sample, but this may work for you:
library(rpivotTable)
specificreportsLocal <- specificreportsLocal %>% filter(Month == "October")
rpivotTable(specificreportsLocal, rows = "Employee Team", cols = "page",
            vals = "views", aggregatorName = "Sum")
Otherwise, if you do not need it interactive (as the Pivot Tables in Excel), this may work as well:
specificreportsLocal %>%
  filter(Month == "October") %>%
  group_by_at(c("Employee Team", "page")) %>%
  summarise(nr_views = sum(views, na.rm = TRUE))
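To lay the result out like an Excel pivot (teams down the side, pages across the top), you could add a pivot_wider step. A sketch, assuming the column names Employee Team, page, and views used in the answer above:
library(tidyr)

specificreportsLocal %>%
  filter(Month == "October") %>%
  group_by_at(c("Employee Team", "page")) %>%
  summarise(nr_views = sum(views, na.rm = TRUE)) %>%
  pivot_wider(names_from = page, values_from = nr_views)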
