R - Transpose columns and rows with conditions - r

I am working with the data frame 'by_class_survival' and I am trying to convert it to another format, swapping the rows and columns while applying conditions. I have already solved this in a very crude way, but I am wondering whether there is a better way to transpose the columns and rows while applying the conditions as part of the transposition.
library(dplyr)
# Titanic is a 4-way contingency table; as_tibble() flattens it to one row per
# Class/Sex/Age/Survived combination with the cell count in column `n`.
# as_tibble() and across() replace the deprecated tbl_df(), mutate_at()
# and funs().
titanic_tbl <- as_tibble(Titanic) %>%
  mutate(across(Class:Survived, factor))
# Passenger count per Class/Survived pair, summed over Sex and Age.
by_class_survival <- titanic_tbl %>%
  group_by(Class, Survived) %>%
  summarize(Count = sum(n))
Original dataframe
# Class Survived Count
# 1 1st No 122
# 2 1st Yes 203
# 3 2nd No 167
# 4 2nd Yes 118
# 5 3rd No 528
# 6 3rd Yes 178
# 7 Crew No 673
# 8 Crew Yes 212
Creating a new dataframe based on the values from by_class_survival
# Hand-typed survival counts per class, each in (No, Yes) order.
first <- c(122, 203)
second <- c(167, 118)
third <- c(528, 178)
crew <- c(673, 212)
titanic.df <- data.frame(first, second, third, crew)
library(data.table)
# Turn the classes into rows, then label the rows with the original column
# names and the columns with the survival outcome.
t_titanic.df <- transpose(titanic.df)
row.names(t_titanic.df) <- names(titanic.df)
names(t_titanic.df) <- c("No survivor", "Survivor")
Expected result
## No survivor Survivor
## first 122 203
## second 167 118
## third 528 178
## crew 673 212
Is there a better way to reach the expected result?

You can do it in one step with reshape2::dcast:
library(reshape2)
library(dplyr)
# Cast to one row per Class and one column per Survived level, summing `n`
# over the variables that are not part of the formula.
dcast(titanic_tbl, Class ~ Survived, fun.aggregate = sum, value.var = "n")
Class No Yes
1 1st 122 203
2 2nd 167 118
3 3rd 528 178
4 Crew 673 212
or you can use tidyr::spread on the summarised data frame:
library(tidyr)
# Sum the counts per Class/Survived pair, then move the Survived levels into
# columns. pivot_wider() is the current replacement for the superseded spread().
titanic_tbl %>%
  group_by(Class, Survived) %>%
  summarise(sum = sum(n)) %>%
  pivot_wider(names_from = Survived, values_from = sum)
# A tibble: 4 x 3
# Groups: Class [4]
Class No Yes
<chr> <dbl> <dbl>
1 1st 122 203
2 2nd 167 118
3 3rd 528 178
4 Crew 673 212

Related

Group and add variable of type stock and another type in a single step?

I want to group by district summing 'incoming' values at quarter and get the value of the 'stock' in the last quarter (3) in just one step. 'stock' can not summed through quarters.
My example dataframe:
library(dplyr)
df <- data.frame ("district"= rep(c("ARA", "BJI", "CMC"), each=3),
"quarter"=rep(1:3,3),
"incoming"= c(4044, 2992, 2556, 1639, 9547, 1191,2038,1942,225),
"stock"= c(19547,3160, 1533,5355,6146,355,5816,1119,333)
)
df
district quarter incoming stock
1 ARA 1 4044 19547
2 ARA 2 2992 3160
3 ARA 3 2556 1533
4 BJI 1 1639 5355
5 BJI 2 9547 6146
6 BJI 3 1191 355
7 CMC 1 2038 5816
8 CMC 2 1942 1119
9 CMC 3 225 333
The actual dataframe has ~45.000 rows and 41 variables of which 8 are of type stock.
The result should be:
# A tibble: 3 × 3
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205
I know how to get to the result, but only in three steps; I don't think that is efficient, and it is error-prone given the data.
My approach:
# Stock is a level, not a flow: keep only its value in the last quarter.
# (Each statement now ends without a trailing %>%, which previously piped the
# result into the next assignment.)
basea <- df %>%
  group_by(district) %>%
  filter(quarter == 3) %>% # take only the last quarter
  summarise(across(stock, sum))
# Incoming is a flow: add it up over all quarters.
baseb <- df %>%
  group_by(district) %>%
  summarise(across(incoming, sum))
# Name the join key explicitly instead of relying on the common-column default.
final <- full_join(basea, baseb, by = "district")
Does anyone have any suggestions to perform the procedure in one (or at least two) steps?
Grateful,
Modus
This assumes that the dataset only has 3 quarters and not 4. If that's not the case, use nth(3) instead of last().
library(tidyverse)
# Per district: the stock of the latest quarter and the total of incoming.
# order_by = quarter makes last() pick the highest quarter even when the rows
# are not already sorted by quarter.
df %>%
  group_by(district) %>%
  summarise(stock = last(stock, order_by = quarter),
            incoming = sum(incoming))
# A tibble: 3 × 3
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205
here is a data.table approach
library(data.table)
# Convert df to a data.table, then per district compute the total incoming and
# the stock found in the group's last row (.N is the group's row count).
setDT(df)
df[, list(incoming = sum(incoming), stock = stock[.N]), by = "district"]
district incoming stock
1: ARA 9592 1533
2: BJI 12377 355
3: CMC 4205 333
Here's a refactor that removes some of the duplicated code. This also seems like a prime use-case for creating a custom function that can be QC'd and maintained easier:
library(dplyr)
df <- data.frame ("district"= rep(c("ARA", "BJI", "CMC"), each=3),
"quarter"=rep(1:3,3),
"incoming"= c(4044, 2992, 2556, 1639, 9547, 1191,2038,1942,225),
"stock"= c(19547,3160, 1533,5355,6146,355,5816,1119,333)
)
#' Aggregate stock and incoming values per district
#'
#' @param df A data frame with columns `district`, `quarter`, `stock` and
#'   `incoming`.
#' @param n_quarter The quarter whose `stock` rows are kept before summing.
#' @return One row per district with `stock` (summed over the rows of
#'   `n_quarter`) and `incoming` (summed over all rows), full-joined on
#'   `district`.
aggregate_stocks <- function(df, n_quarter) {
  by_district <- dplyr::group_by(df, district)

  # Stock: restrict to the requested quarter before aggregating.
  stock_at_quarter <- dplyr::summarise(
    dplyr::filter(by_district, quarter == n_quarter),
    dplyr::across(stock, sum)
  )

  # Incoming: aggregate over every quarter.
  incoming_total <- dplyr::summarise(
    by_district,
    dplyr::across(incoming, sum)
  )

  dplyr::full_join(stock_at_quarter, incoming_total, by = "district")
}
aggregate_stocks(df, 3)
#> # A tibble: 3 × 3
#> district stock incoming
#> <chr> <dbl> <dbl>
#> 1 ARA 1533 9592
#> 2 BJI 355 12377
#> 3 CMC 333 4205
Here is the same solution as @Tom Hoel's, but without using a function to subset; instead it just uses []:
library(dplyr)
# Per district: the stock of the group's third row and the sum of incoming.
# stock[3] is positional, so this relies on each district's rows being
# ordered by quarter.
df %>%
group_by(district) %>%
summarise(stock = stock[3],
incoming = sum(incoming))
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205

Pivot wider to one row in R

Here is the sample code that I am using
library(dplyr)
naics <- c("000000","000000",123000,123000)
year <- c(2020,2021,2020,2021)
January <- c(250,251,6,9)
February <- c(252,253,7,16)
March <- c(254,255,8,20)
sample2 <- data.frame (naics, year, January, February, March)
Here is the intended result
Jan2020 Feb2020 March2020 Jan2021 Feb2021 March2021
000000 250 252 254 251 253 255
123000 6 7 8 9 16 20
Is this something that is done with pivot_wider or is it more complex?
We use pivot_wider by selecting the values_from with the month column, names_from as 'year' and then change the column name format in names_glue and if needed convert the 'naics' to row names with column_to_rownames (from tibble)
library(tidyr)
library(tibble)
pivot_wider(sample2, names_from = year, values_from = January:March,
names_glue = "{substr(.value, 1, 3)}{year}")%>%
column_to_rownames('naics')
-output
Jan2020 Jan2021 Feb2020 Feb2021 Mar2020 Mar2021
000000 250 251 252 253 254 255
123000 6 9 7 16 8 20
With reshape function from BaseR,
# One row per naics and one column per month/year pair. `direction` is spelled
# out in full (no partial argument matching), and the row names are taken from
# sample2 itself rather than from a free-standing `naics` vector.
# [, -1] drops the naics column, which is now carried by the row names.
reshape(sample2, direction = "wide", sep = "",
        idvar = "naics",
        timevar = "year",
        new.row.names = unique(sample2$naics))[, -1]
# January2020 February2020 March2020 January2021 February2021 March2021
# 000000 250 252 254 251 253 255
# 123000 6 7 8 9 16 20
This takes a longer route than @akrun's answer. I will leave it here in case it helps build intuition about the steps being taken. Otherwise, @akrun's answer is more resource-efficient.
sample2 %>%
tidyr::pivot_longer(-c(naics, year), names_to = "month",
values_to = "value") %>%
mutate(Month=paste0(month, year)) %>%
select(-year, - month) %>%
tidyr::pivot_wider(names_from = Month,values_from = value)
# A tibble: 2 x 7
naics January2020 February2020 March2020 January2021 February2021
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 000000 250 252 254 251 253
2 123000 6 7 8 9 16
# ... with 1 more variable: March2021 <dbl>

Conditional Calculation for a Column in R

I have the following data:
pop.2017 <- c(434,346,345,357)
pop.2018 <- c(334,336,325,345)
pop.2019 <- c(477,346,145,345)
pop.2020 <- c(474,366,341,300)
total <- c(34,36,34,35)
incident_month_yr <- c("2017-2","2017-5","2018-2","2019-2")
df <- data.frame(incident_month_yr,pop.2017,pop.2018,pop.2019,pop.2020,total)
df['perc'] <- NA
For rows where incident_month_yr contains 2017, I want perc to equal total/pop.2017
For rows where incident_month_yr contains 2018, I want perc to equal total/pop.2018
For rows where incident_month_yr contains 2019, I want perc to equal total/pop.2019
For rows where incident_month_yr contains 2020, I want perc to equal total/pop.2020
I've tried this:
df$perc[grepl(2017,df$incident_month_yr)] <- df$total/df$pop.2017
df$perc[grepl(2018,df$incident_month_yr)] <- df$total/df$pop.2018
df$perc[grepl(2019,df$incident_month_yr)] <- df$total/df$pop.2019
df$perc[grepl(2020,df$incident_month_yr)] <- df$total/df$pop.2020
However, it's not applying the calculations to specific rows like I want. How can I do this?
You can use the following solution:
library(dplyr)
library(stringr)
# Divide `total` by the population column of the year found in
# incident_month_yr. All four years are handled explicitly; a row that matches
# none of them gets NA instead of silently falling back to pop.2019.
df %>%
  mutate(perc = ifelse(str_detect(incident_month_yr, "2017"), total / pop.2017,
                ifelse(str_detect(incident_month_yr, "2018"), total / pop.2018,
                ifelse(str_detect(incident_month_yr, "2019"), total / pop.2019,
                ifelse(str_detect(incident_month_yr, "2020"), total / pop.2020,
                       NA_real_)))))
incident_month_yr pop.2017 pop.2018 pop.2019 pop.2020 total perc
1 2017-2 434 334 477 474 34 0.07834101
2 2017-5 346 336 346 366 36 0.10404624
3 2018-2 345 325 145 341 34 0.10461538
4 2019-2 357 345 345 300 35 0.10144928
Special thanks to dear @akrun
We can also replace str_detect with grepl function from base R to use fewer packages and use case_when in place of ifelse as an unnested alternative.
# Same calculation as an unnested case_when(): one branch per year, and NA for
# rows whose incident_month_yr matches none of them (previously any unmatched
# row, including 2020, was divided by pop.2019).
df %>%
  mutate(perc = case_when(
    grepl("2017", incident_month_yr) ~ total / pop.2017,
    grepl("2018", incident_month_yr) ~ total / pop.2018,
    grepl("2019", incident_month_yr) ~ total / pop.2019,
    grepl("2020", incident_month_yr) ~ total / pop.2020,
    TRUE ~ NA_real_
  ))
incident_month_yr pop.2017 pop.2018 pop.2019 pop.2020 total perc
1 2017-2 434 334 477 474 34 0.07834101
2 2017-5 346 336 346 366 36 0.10404624
3 2018-2 345 325 145 341 34 0.10461538
4 2019-2 357 345 345 300 35 0.10144928
We can do this with match. Get the column names that contain the 'pop' substring ('nm1'), remove the non-year characters from 'incident_month_yr' ('nm2') and from the column names ('nm3'), use match to return the column index, cbind it with the sequence of rows, extract the values from the 'pop' columns, divide 'total' by them, and assign the result to the 'perc' column.
# Names of the columns containing "pop", e.g. "pop.2017".
nm1 <- grep('pop', names(df), value = TRUE)
# Treat "-" plus everything after it as trimmable "whitespace", which leaves
# only the year part of incident_month_yr.
nm2 <- trimws(df$incident_month_yr, whitespace = '-.*')
# Same trick to drop the "pop." prefix, leaving the year of each column name.
nm3 <- trimws(nm1, whitespace = 'pop\\.')
# Index df[nm1] with a two-column (row, column) matrix so each row picks the
# population column matched to its own year, then divide total by that value.
df$perc <- df$total/df[nm1][cbind(seq_len(nrow(df)), match(nm2, nm3))]
df$perc
#[1] 0.07834101 0.10404624 0.10461538 0.10144928
In dplyr, an option is to use rowwise: construct the column name from 'incident_month_yr' with str_replace by capturing the year part and adding 'pop.' as a prefix, get the value of that column, and divide the 'total' column by it.
library(stringr)
library(dplyr)
df %>%
rowwise %>%
mutate(perc = total/get(str_replace(incident_month_yr,
"(\\d{4})-\\d+", 'pop.\\1'))) %>%
ungroup
-output
# A tibble: 4 x 7
# incident_month_yr pop.2017 pop.2018 pop.2019 pop.2020 total perc
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 2017-2 434 334 477 474 34 0.0783
#2 2017-5 346 336 346 366 36 0.104
#3 2018-2 345 325 145 341 34 0.105
#4 2019-2 357 345 345 300 35 0.101
Here are two approaches, one in base R and one using tidy data. The provided data is not tidy, that's why base R looks uncomfortable:
# Define the target
target <- c(0.07834101, 0.10404624, 0.10461538, 0.10144928)
That is our goal, calculating target.
First, use base R and ifelse:
result1 <- with(df,
ifelse(grepl(2017, incident_month_yr),
total/pop.2017,
ifelse(grepl(2018, incident_month_yr),
total/pop.2018,
ifelse(grepl(2019, incident_month_yr),
total/pop.2019,
ifelse(grepl(2020, incident_month_yr),
total/pop.2020,
NA)))))
identical(round(result1, 4), round(target, 4))
#> [1] TRUE
And, the tidy way, reshaping into tidy data and calculating the result:
library(dplyr)
library(tidyr)
# Reshape to one row per (incident, population year), keep the row whose
# population year equals the incident year, then compute the ratio.
# names_prefix is a regular expression, so the dot is escaped to match a
# literal ".".
result2 <- df %>%
  pivot_longer(starts_with("pop."), names_to = "pop", names_prefix = "pop\\.") %>%
  filter(substr(incident_month_yr, 1, 4) == pop) %>%
  mutate(perc = total / value) %>%
  pull(perc)
identical(round(result2, 4), round(target, 4))
#> [1] TRUE

Choose groups with consecutive year-quarters

I hope to choose the identifiers that have consecutive year-quarter records. For example, ID 111 will be selected because it has all year-quarters. ID 113 will be selected because the year-quarter combinations are consecutive, although the ID only has a portion of the total year-quarters. ID 112 will not be selected because the year-quarter is not consecutive. It lacks 201601, 201602, 201603.
Identifer year-quarter
111 201503
111 201504
111 201601
111 201602
111 201603
111 201604
112 201503
112 201504
112 201604
113 201503
113 201504
113 201601
My current code (below) can only deal with selecting IDs that have the full year-quarter combinations. I wonder how to achieve my desired outcome.
# Keep the identifiers that have exactly six year-quarter records.
# `==` tests equality; `total = 6` would be read as a named argument.
df2 <- df1 %>%
  group_by(Identifer) %>%
  summarize(total = n()) %>%
  filter(total == 6)
The desired outcome is
Identifer
111
113
To select 'Identifiers', convert 'year.quarter' to zoo::yearqtr, take the difference between consecutive values by group, and check whether all differences are 0.25*.
library(zoo)
tapply(as.yearqtr(as.character(d$year.quarter), format = "%Y%q"), d$Identifer,
FUN = function(x) all(diff(as.numeric(x)) == 0.25))
# 111 112 113
# TRUE FALSE TRUE
To select corresponding rows, use a similar logic with ave:
d[as.logical(ave(as.yearqtr(as.character(d$year.quarter), format = "%Y%q"), d$Identifer,
FUN = function(x) all(diff(x) == 0.25))), ]
# Identifer year.quarter
# 1 111 201503
# 2 111 201504
# 3 111 201601
# 4 111 201602
# 5 111 201603
# 6 111 201604
# 10 113 201503
# 11 113 201504
# 12 113 201601
*From ?as.yearqtr:
The "yearqtr" class is used to represent quarterly data. Internally it holds the data as year plus 0 for Quarter 1, 1/4 for Quarter 2 and so on
The post was improved by comments from #G.Grothendieck. Thanks!
One way we could do this is by using dplyr and lubridate together. We can group_by Identifer, use the yq function to convert year-quarter to a date, take the difference between consecutive dates, and keep the groups where every difference lies in the range of 90-120 days, the maximum gap we allow between one quarter and the next.
library(dplyr)
library(lubridate)
df %>%
group_by(Identifer) %>%
mutate(yearq = c(90, diff(yq(year.quarter)))) %>%
filter(all(yearq > 89 & yearq < 120)) %>%
select(Identifer) %>%
unique()
# Identifer
# <int>
#1 111
#2 113

String split: need to account for space before characters?

I am adapting a code for my own needs, which has problems. I've been able to address most of the issues but am stuck on this current step. I've uploaded a pdf into R and have done a series of steps to manipulate the file for text mining.
I'm now trying to split each line of text. `useful` is a (?) list of character strings, and I've printed its 11th element.
useful[11]
>" Busti
169 425 Total 2,786 5,259 Franklin
256 410"
As you can see, there's a big space before Busti. Useful[11] is the last row in the pdf page. Essentially, the first column is blank, Busti is the 2nd column, Total is the third column, and Franklin is the 4th column of the same row.
I am then splitting useful[11] so each column is now an individual object.
# Split at whitespace that sits between a digit and a capital letter, or at a
# run of two or more whitespace characters between a letter and a capital.
# [A-Za-z] replaces [aA-zZ], whose A-z range also matched the punctuation
# characters that lie between "Z" and "a"; TRUE replaces the reassignable T.
split <-
  strsplit(useful,
           "(?<=[0-9])\\s+(?=[A-Z])|(?<=[A-Za-z])\\s{2,}+(?=[A-Z])",
           perl = TRUE)
split[11]
[[1]]
[1] " Busti
169 425"
[2] "Total 2,786 5,259"
[3] "Franklin 256 410"
Instead of recognizing each column as an object, R is seeing obj 1-Busti, object 2-Total, object 3-Franklin whereas I want: object 1-space, object 2-Busti, and so on.
For example, in the row above i.e. useful[10], there is no empty space in any of the columns so:
useful[10]
[1] "Total 1,399 2,915 Arkwright 154 320 Smyrna 179 319 Deposit 110 169"
So when I use the split function, I get:
split[10]
[[1]]
[1] "Total 1,399 2,915" "Arkwright 154 320" "Smyrna 179 319"
[4] "Deposit 110 169"
Could someone help me figure out how to do the proper regex to account for this issue? Thank you in advance!
Here's an approach using the tidyverse and purrr:
library(tidyverse)
useful <- c(" Busti
169 425 Total 2,786 5,259 Franklin
256 410", "Total 1,399 2,915 Arkwright 154 320 Smyrna 179 319 Deposit 110 169")
# str_squish() is vectorised, so it is applied to the whole vector directly;
# str_split() then receives a character vector rather than a list.
useful %>%
  str_squish() %>%
  str_split("\\s+")
# [[1]]
# [1] "Busti" "169" "425" "Total" "2,786" "5,259" "Franklin" "256" "410"
#
# [[2]]
# [1] "Total" "1,399" "2,915" "Arkwright" "154" "320" "Smyrna" "179" "319" "Deposit" "110"
# [12] "169"
Alternatively:
map(useful, str_squish) %>%
str_split("\\s+(?=[[:alpha:]])")
# [[1]]
# [1] "Busti 169 425" "Total 2,786 5,259" "Franklin 256 410"
#
# [[2]]
# [1] "Total 1,399 2,915" "Arkwright 154 320" "Smyrna 179 319" "Deposit 110 169"
And then you may want to consider...
map(useful, str_squish) %>%
str_split("\\s+(?=[[:alpha:]])") %>%
enframe %>%
unnest
# # A tibble: 7 x 2
# name value
# <int> <chr>
# 1 1 Busti 169 425
# 2 1 Total 2,786 5,259
# 3 1 Franklin 256 410
# 4 2 Total 1,399 2,915
# 5 2 Arkwright 154 320
# 6 2 Smyrna 179 319
# 7 2 Deposit 110 169
Or even...
# Collapse whitespace, split before each word, and stack the pieces into one
# row per "Group Item1 Item2" string. unnest() names its column and across()
# replaces mutate_at(), both of which are the current forms.
str_squish(useful) %>%
  str_split("\\s+(?=[[:alpha:]])") %>%
  enframe() %>%
  unnest(value) %>%
  separate(value, c("Group", "Item1", "Item2"), sep = "\\s") %>%
  # Drop the thousands separator before converting the items to numbers.
  mutate(across(starts_with("Item"), ~ str_replace(., ",", "") %>% as.numeric()))
# # A tibble: 7 x 4
# name Group Item1 Item2
# <int> <chr> <dbl> <dbl>
# 1 1 Busti 169 425
# 2 1 Total 2786 5259
# 3 1 Franklin 256 410
# 4 2 Total 1399 2915
# 5 2 Arkwright 154 320
# 6 2 Smyrna 179 319
# 7 2 Deposit 110 169
And finally, if the number of "items" is unknown or of varying length, you'll want to do something like the following and/or reference this question:
map(useful, str_squish) %>%
str_split("\\s+(?=[[:alpha:]])") %>%
enframe %>%
unnest %>%
mutate(to_sep = str_split(value, "\\s")) %>%
unnest(to_sep) %>%
group_by(value) %>%
mutate(row = row_number()) %>%
spread(row, to_sep)
# # A tibble: 7 x 5
# # Groups: value [7]
# name value `1` `2` `3`
# <int> <chr> <chr> <chr> <chr>
# 1 1 Busti 169 425 Busti 169 425
# 2 1 Franklin 256 410 Franklin 256 410
# 3 1 Total 2,786 5,259 Total 2,786 5,259
# 4 2 Arkwright 154 320 Arkwright 154 320
# 5 2 Deposit 110 169 Deposit 110 169
# 6 2 Smyrna 179 319 Smyrna 179 319
# 7 2 Total 1,399 2,915 Total 1,399 2,915
You may want to consider breaking this off into a more specific question, especially now that you are providing the pdf and ask more directly what you are trying to achieve. That being said, I'm not sure the blanks are relevant here, as you could use the following pipeline.
library(pdftools)
library(tidyverse)
text <- pdf_text("https://www.dec.ny.gov/docs/wildlife_pdf/09deerrpt.pdf")
clean_text <-
text %>%
str_squish() %>%
magrittr::extract(., 14:17) %>%
paste(collapse = " ") %>%
# First get rid of the header text
str_remove("New York State Department of Environmental.*TOTAL TAKE. ") %>%
# Now get rid of Page numbers, e.g., Page 14, Page 15
str_remove_all("Page [[:digit:]]{2}") %>%
# Get rid of the COUNTY labels since they're not going to line up anyway...
str_remove_all("[A-Z]{2,}") %>%
# Remove Totals since they won't line up...
str_remove("Statewide Totals.*") %>%
# Remove commas from numbers
str_remove_all(",") %>%
# Another squish for good measure and for some less than perfect removals above
str_squish()
clean_text %>%
  # Remove the individual total lines
  str_remove_all("Total\\s\\w+\\s\\w+") %>%
  str_squish() %>%
  # One match per "<location> <number> <number>" run
  str_extract_all("[A-Za-z ]+\\s\\d+\\s\\d+") %>%
  unlist() %>%
  str_squish() %>%
  # tibble() replaces the deprecated data_frame()
  tibble(by_line = .) %>%
  # Namespaced because magrittr::extract, used when building clean_text,
  # shares the name but does something different
  tidyr::extract(
    by_line, c("location", "adult_take", "total_take"),
    regex = "([A-Za-z ]+\\s?)(\\d+\\s?)(\\d+\\s?)"
  ) %>%
  mutate(
    location = str_squish(location),
    adult_take = as.numeric(str_squish(adult_take)),
    total_take = as.numeric(str_squish(total_take))
  )
# # A tibble: 943 x 3
# location adult_take total_take
# <chr> <dbl> <dbl>
# 1 Carroll 103 215
# 2 Albany City 24 41
# 3 Allegany 115 231
# 4 Charlotte 116 248
# 5 Altona 50 87
# 6 Berne 163 292
# 7 Ashford 338 721
# 8 Chautauqua 242 613
# 9 Ausable 18 21
# 10 Bethlehem 141 280
# # ... with 933 more rows

Resources