dplyr arrange is not working while order is fine - r

I am trying to obtain the largest 10 investors in a country but obtain confusing result using arrange in dplyr versus order in base R.
head(fdi_partner)
give the following results
# A tibble: 6 x 3
`Main counterparts` `Number of projects` `Total registered capital (Mill. USD)(*)`
<chr> <chr> <chr>
1 TOTAL 1818 38854.3
2 Singapore 231 11358.66
3 Korea Rep.of 377 7679.9
4 Japan 204 4325.79
5 Netherlands 24 4209.64
6 China, PR 216 3001.79
and
fdi_partner %>%
rename("Registered capital" = "Total registered capital (Mill. USD)(*)") %>%
mutate_at(c("Number of projects", "Registered capital"), as.numeric) %>%
arrange("Number of projects") %>%
head()
give almost the same result
# A tibble: 6 x 3
`Main counterparts` `Number of projects` `Registered capital`
<chr> <dbl> <dbl>
1 TOTAL 1818 38854.
2 Singapore 231 11359.
3 Korea Rep.of 377 7680.
4 Japan 204 4326.
5 Netherlands 24 4210.
6 China, PR 216 3002.
while the following code is working fine with base R
head(fdi_partner)
fdi_numeric <- fdi_partner %>%
rename("Registered capital" = "Total registered capital (Mill. USD)(*)") %>%
mutate_at(c("Number of projects", "Registered capital"), as.numeric)
head(fdi_numeric[order(fdi_numeric$"Number of projects", decreasing = TRUE), ], n=11)
which gives
# A tibble: 11 x 3
`Main counterparts` `Number of projects` `Registered capital`
<chr> <dbl> <dbl>
1 TOTAL 1818 38854.
2 Korea Rep.of 377 7680.
3 Singapore 231 11359.
4 China, PR 216 3002.
5 Japan 204 4326.
6 Hong Kong SAR (China) 132 2365.
7 United States 83 783.
8 Taiwan 66 1464.
9 United Kingdom 50 331.
10 F.R Germany 37 131.
11 Thailand 36 370.
Can anybody help explain what is going wrong here?

dplyr (and more generally tidyverse packages) accept only unquoted variable names. If you pass a quoted string instead, arrange() treats it as a constant character value, so it sorts by that constant and leaves the row order unchanged. If your variable name has a space in it, you must wrap it in backticks:
library(dplyr)
test <- data.frame(`My variable` = c(3, 1, 2), var2 = c(1, 1, 1), check.names = FALSE)
test
#> My variable var2
#> 1 3 1
#> 2 1 1
#> 3 2 1
# Your code (doesn't work)
test %>%
arrange("My variable")
#> My variable var2
#> 1 3 1
#> 2 1 1
#> 3 2 1
# Solution
test %>%
arrange(`My variable`)
#> My variable var2
#> 1 1 1
#> 2 2 1
#> 3 3 1
Created on 2023-01-05 with reprex v2.0.2

Related

Creating serial number for unique entries in R

I wanted to assign the same serial number to all identical Submission_Ids under one Batch_number. Could someone please help me figure this out?
Submission_Id <- c(619295,619295,619295,619295,619296,619296,619296,619296,619296,556921,556921,559254,647327,647327,647327,646040,646040,646040,646040,646040,64604)
Batch_No <- c(633,633,633,633,633,633,633,633,633,633,633,633,634,634,634,650,650,650,650,650,650)
Expected result
Sl.No <- c(1,1,1,1,2,2,2,2,2,3,3,4,1,1,1,1,1,1,1,1,1)
One way to do it is creating run-length IDs with data.table::rleid(Submission_Id), grouped by Batch_No with group_by(). We can use this inside dplyr. To show this I created a tibble() with both given vectors Batch_No and Submission_Id.
library(dplyr)
library(data.table)
dat <- tibble(Submission_Id = Submission_Id,
Batch_No = Batch_No)
dat %>%
group_by(Batch_No) %>%
mutate(S1.No = data.table::rleid(Submission_Id))
#> # A tibble: 21 x 3
#> # Groups: Batch_No [3]
#> Submission_Id Batch_No S1.No
#> <dbl> <dbl> <int>
#> 1 619295 633 1
#> 2 619295 633 1
#> 3 619295 633 1
#> 4 619295 633 1
#> 5 619296 633 2
#> 6 619296 633 2
#> 7 619296 633 2
#> 8 619296 633 2
#> 9 619296 633 2
#> 10 556921 633 3
#> # ... with 11 more rows
The original data
Submission_Id <- c(619295,619295,619295,619295,619296,619296,619296,619296,619296,556921,556921,559254,647327,647327,647327,646040,646040,646040,646040,646040,64604)
Batch_No <- c(633,633,633,633,633,633,633,633,633,633,633,633,634,634,634,650,650,650,650,650,650)
Created on 2022-12-16 by the reprex package (v2.0.1)

Combining rows and generating category counts

I want to be able to first combine rows with a similar attribute into one (for example, one row for each City/Year), and then find the specific counts for types of categories for each of those rows.
For example, with this as the original data:
City Year Type of Death
NYC 1995 Homicide
NYC 1996 Homicide
NYC 1996 Suicide
LA 1995 Suicide
LA 1995 Homicide
LA 1995 Suicide
I want to be able to produce something like this:
City Year n_Total n_Homicides n_Suicides
NYC 1995 1 1 0
NYC 1996 2 1 1
LA 1995 3 1 2
I've tried something like the below, but it only gives me the n_Total and doesn't take into account the splits for n_Homicides and n_Suicides:
library(dplyr)
total_deaths <- data %>%
group_by(city, year)%>%
summarize(n_Total= n())
You may do this
library(tidyverse, warn.conflicts = F)
df <- read.table(header = T, text = 'City Year TypeofDeath
NYC 1995 Homicide
NYC 1996 Homicide
NYC 1996 Suicide
LA 1995 Suicide
LA 1995 Homicide
LA 1995 Suicide')
df %>%
pivot_wider(names_from = TypeofDeath, values_fn = length, values_from = TypeofDeath, values_fill = 0, names_prefix = 'n_') %>%
mutate(n_total = rowSums(select(cur_data(), starts_with('n_'))))
#> # A tibble: 3 x 5
#> City Year n_Homicide n_Suicide n_total
#> <chr> <int> <int> <int> <dbl>
#> 1 NYC 1995 1 0 1
#> 2 NYC 1996 1 1 2
#> 3 LA 1995 1 2 3
Created on 2021-07-05 by the reprex package (v2.0.0)
If you don't have too many types of death, then something simple (albeit a little "manual") like this might have some appeal.
library(dplyr, warn.conflicts = FALSE)
df <- read.table(header = TRUE, text = 'City Year TypeofDeath
NYC 1995 Homicide
NYC 1996 Homicide
NYC 1996 Suicide
LA 1995 Suicide
LA 1995 Homicide
LA 1995 Suicide')
df %>%
group_by(City, Year) %>%
summarize(n_Total = n(),
n_Suicide = sum(TypeofDeath == "Suicide"),
n_Homicide = sum(TypeofDeath == "Homicide"))
#> `summarise()` has grouped output by 'City'. You can override using the `.groups` argument.
#> # A tibble: 3 x 5
#> # Groups: City [2]
#> City Year n_Total n_Suicide n_Homicide
#> <chr> <int> <int> <int> <int>
#> 1 LA 1995 3 2 1
#> 2 NYC 1995 1 0 1
#> 3 NYC 1996 2 1 1
Created on 2021-07-05 by the reprex package (v2.0.0)
You can first dummify your factor variable using the fastDummies package, than summarise(). This is a more general and versatile approach that can be used seamlessly with any number of unique types of death.
If you only have two types of death and will settle for a simpler (though more "manual") approach, you can use the other suggestions with summarise(x = ..., y = ..., total = n())
library(dplyr)
library(fastDummies)
df%>%fastDummies::dummy_cols('TypeofDeath', remove_selected_columns = TRUE)%>%
group_by(City, Year)%>%
summarise(across(contains('Type'), sum),
total_deaths=n())
# A tibble: 3 x 5
# Groups: City [2]
City Year TypeofDeath_Homicide TypeofDeath_Suicide total_deaths
<chr> <int> <int> <int> <int>
1 LA 1995 1 2 3
2 NYC 1995 1 0 1
3 NYC 1996 1 1 2

Using dplyr mutate function to create new variable conditionally based on current row

I am working on creating conditional averages for a large data set that involves # of flu cases seen during the week for several years. The data is organized as such:
What I want to do is create a new column that tabulates that average number of cases for that same week in previous years. For instance, for the row where Week.Number is 1 and Flu.Year is 2017, I would like the new row to give the average count for any year with Week.Number==1 & Flu.Year<2017. Normally, I would use the case_when() function to conditionally tabulate something like this. For instance, when calculating the average weekly volume I used this code:
mutate(average = case_when(
Flu.Year==2016 ~ mean(chcc$count[chcc$Flu.Year==2016]),
Flu.Year==2017 ~ mean(chcc$count[chcc$Flu.Year==2017]),
Flu.Year==2018 ~ mean(chcc$count[chcc$Flu.Year==2018]),
Flu.Year==2019 ~ mean(chcc$count[chcc$Flu.Year==2019]),
),
However, since there are four years of data * 52 weeks which is a lot of iterations to spell out the conditions for. Is there a way to elegantly code this in dplyr? The problem I keep running into is that I want to call values in counts column based on Week.Number and Flu.Year values in other rows conditioned on the current value of Week.Number and Flu.Year, and I am not sure how to accomplish that. Please let me know if there is further information / detail I can provide.
Thanks,
Steven
dat <- tibble( Flu.Year = rep(2016:2019,each = 52), Week.Number = rep(1:52,4), count = sample(1000, size=52*4, replace=TRUE) )
It's bad-form and, in some cases, an error when you use $-indexing within dplyr verbs.
I think a better way to get that average field is to group_by(Flu.Year) and calculate it straight-up.
library(dplyr)
set.seed(42)
dat <- tibble(
Flu.Year = sample(2016:2020, size=100, replace=TRUE),
count = sample(1000, size=100, replace=TRUE)
)
dat %>%
group_by(Flu.Year) %>%
mutate(average = mean(count)) %>%
# just to show a quick summary
slice(1:3) %>%
ungroup()
# # A tibble: 15 x 3
# Flu.Year count average
# <int> <int> <dbl>
# 1 2016 734 578.
# 2 2016 356 578.
# 3 2016 411 578.
# 4 2017 217 436.
# 5 2017 453 436.
# 6 2017 920 436.
# 7 2018 963 558
# 8 2018 609 558
# 9 2018 536 558
# 10 2019 943 543.
# 11 2019 740 543.
# 12 2019 536 543.
# 13 2020 627 494.
# 14 2020 218 494.
# 15 2020 389 494.
An alternative approach is to generate a summary table (just one row per year) and join it back in to the original data.
dat %>%
group_by(Flu.Year) %>%
summarize(average = mean(count))
# # A tibble: 5 x 2
# Flu.Year average
# <int> <dbl>
# 1 2016 578.
# 2 2017 436.
# 3 2018 558
# 4 2019 543.
# 5 2020 494.
dat %>%
group_by(Flu.Year) %>%
summarize(average = mean(count)) %>%
full_join(dat, by = "Flu.Year")
# # A tibble: 100 x 3
# Flu.Year average count
# <int> <dbl> <int>
# 1 2016 578. 734
# 2 2016 578. 356
# 3 2016 578. 411
# 4 2016 578. 720
# 5 2016 578. 851
# 6 2016 578. 822
# 7 2016 578. 465
# 8 2016 578. 679
# 9 2016 578. 30
# 10 2016 578. 180
# # ... with 90 more rows
The result, after chat:
tibble( Flu.Year = rep(2016:2018,each = 3), Week.Number = rep(1:3,3), count = 1:9 ) %>%
arrange(Flu.Year, Week.Number) %>%
group_by(Week.Number) %>%
mutate(year_week.average = lag(cumsum(count) / seq_along(count)))
# # A tibble: 9 x 4
# # Groups: Week.Number [3]
# Flu.Year Week.Number count year_week.average
# <int> <int> <int> <dbl>
# 1 2016 1 1 NA
# 2 2016 2 2 NA
# 3 2016 3 3 NA
# 4 2017 1 4 1
# 5 2017 2 5 2
# 6 2017 3 6 3
# 7 2018 1 7 2.5
# 8 2018 2 8 3.5
# 9 2018 3 9 4.5
We can use aggregate from base R
aggregate(count ~ Flu.Year, data, FUN = mean)

Gather or transpose data with multiple rows as 'key' argument

In my mind, I want tidyr::gather() to gather not only the column names but also rows 1 and 2. What I want to achieve is to have a data frame with 5 columns and 4 rows.
This is a little piece of the dataset I'm working with:
library(tidyverse)
# A tibble: 4 x 3
Aanduiding `Coolsingel 40 links` `Goudseweg 15 links`
<chr> <chr> <chr>
1 Gebiedsnummer 1 2
2 Postcode 3011 AD 3031 XH
3 Leefbaar Rotterdam 124 110
4 Partij van de Arbeid (P.v.d.A.) 58 65
and its reproducable dput(df) to work with:
df <- structure(list(Aanduiding = c("Gebiedsnummer", "Postcode", "Leefbaar Rotterdam",
"Partij van de Arbeid (P.v.d.A.)"), `Coolsingel 40 links` = c("1",
"3011 AD", "124", "58"), `Goudseweg 15 links` = c("2", "3031 XH",
"110", "65")), row.names = c(NA, -4L), class = c("tbl_df", "tbl",
"data.frame"), .Names = c("Aanduiding", "Coolsingel 40 links",
"Goudseweg 15 links"))
So wanted out put looks like this:
Aanduiding Gebiedsnummer Postcode adres value
<chr> <dbl> <chr> <chr> <dbl>
1 Leefbaar Rotterdam 1.00 3011 AD Coolsingel 40 links 124
2 Leefbaar Rotterdam 1.00 3031 XH Goudseweg 15 links 120
3 Partij van de Arbeid (P.v.d.A.) 2.00 3011 AD Coolsingel 40 links 58.0
4 Partij van de Arbeid (P.v.d.A.) 2.00 3031 XH Goudseweg 15 links 65.0
I use the gather() function from the tidyr package a lot, but this is alway when I only want to gather the column names with a certain value. Now I actually want to gather the column names but also observation on row 1 and 2.
Can I gather on multiple key's? Or paste the values in observation 1 and 2 to the column, then gather() and then separate()?
What's the best tactic here, if possible in a tidyr way.
Much appreciated.
There are two things that need to be done here, and you'll have to figure out how to break down your dataset accordingly.
data.frame(t(df[1:2,]))
will give you:
X1 X2
Aanduiding Gebiedsnummer Postcode
Coolsingel 40 links 1 3011 AD
Goudseweg 15 links 2 3031 XH
And
tidyr::gather(df[3:4,],key="adres",value="value", `Coolsingel 40 links`, `Goudseweg 15 links`)
will give you:
Aanduiding adres value
<chr> <chr> <chr>
1 Leefbaar Rotterdam Coolsingel 40 links 124
2 Partij van de Arbeid (P.v.d.A.) Coolsingel 40 links 58
3 Leefbaar Rotterdam Goudseweg 15 links 110
4 Partij van de Arbeid (P.v.d.A.) Goudseweg 15 links 65
How you go on from there is another problem, possibly a left_join based on adres, but that really depends on how the rest of the data is structured.
You can do this with a combination of gather and spread a few times. I do this often when I need to move a value out to serve as a denominator for a calculation.
library(tidyverse)
...
The goal is to move Gebiedsnummer and Postcode out of Aanduiding, and to gather the other two columns into one column of values. The first gather gets you this:
df %>%
gather(key = address, value = value, -Aanduiding)
#> # A tibble: 8 x 3
#> Aanduiding address value
#> <chr> <chr> <chr>
#> 1 Gebiedsnummer Coolsingel 40 links 1
#> 2 Postcode Coolsingel 40 links 3011 AD
#> 3 Leefbaar Rotterdam Coolsingel 40 links 124
#> 4 Partij van de Arbeid (P.v.d.A.) Coolsingel 40 links 58
#> 5 Gebiedsnummer Goudseweg 15 links 2
#> 6 Postcode Goudseweg 15 links 3031 XH
#> 7 Leefbaar Rotterdam Goudseweg 15 links 110
#> 8 Partij van de Arbeid (P.v.d.A.) Goudseweg 15 links 65
Using a spread after that gets:
df %>%
gather(key = address, value = value, -Aanduiding) %>%
spread(key = Aanduiding, value = value)
#> # A tibble: 2 x 5
#> address Gebiedsnummer `Leefbaar Rotter… `Partij van de Arbe… Postcode
#> <chr> <chr> <chr> <chr> <chr>
#> 1 Coolsinge… 1 124 58 3011 AD
#> 2 Goudseweg… 2 110 65 3031 XH
Then you want to gather again, but to keep address, Gebiedsnummer, and Postcode as their own columns. The select is just there to get the columns in order. So all together:
df %>%
gather(key = address, value = value, -Aanduiding) %>%
spread(key = Aanduiding, value = value) %>%
gather(key = Aanduiding, value = value, -Gebiedsnummer, -address, -Postcode) %>%
select(Aanduiding, Gebiedsnummer, Postcode, address, value) %>%
mutate_at(vars(Gebiedsnummer, value), as.numeric)
#> # A tibble: 4 x 5
#> Aanduiding Gebiedsnummer Postcode address value
#> <chr> <dbl> <chr> <chr> <dbl>
#> 1 Leefbaar Rotterdam 1 3011 AD Coolsingel 40 l… 124
#> 2 Leefbaar Rotterdam 2 3031 XH Goudseweg 15 li… 110
#> 3 Partij van de Arbeid (P.v… 1 3011 AD Coolsingel 40 l… 58
#> 4 Partij van de Arbeid (P.v… 2 3031 XH Goudseweg 15 li… 65
Created on 2018-08-24 by the reprex package (v0.2.0).

Merge dataframes with some common columns and fill in others with NAs

I have around 50+ csv files that all share the same 4 columns in this order:
REG_ID region age age_num
and then years anything from 1990 till 2016 in this format:
REG_ID region age age_num y_1992 y_1993 y_1994 y_2014.15
and I was wondering what could be the best way to merge them. Going thru each to add the missing years-columns would be time consuming and likely lead to errors.
The end format would be something like this:
REG_ID region reg_num age age_num y_1991 y_1992 y_1993
BFM2 Boucle 1 c_0_4 0 770 NA 120
BFM2 Boucle 1 c_5_9 5 810 NA 11
BFM2 Boucle 1 c_10_14 10 704 NA 130
BFM2 Boucle 1 c_15_19 15 71 NA 512
BFM2 Boucle 1 c_20_24 20 181 NA 712
Here's a way you can do it using tidyverse tools. First use dir to get a vector of csv paths, then use purrr:map to read them all in, returning a list of the data frames, and then use purrr::reduce to merge all the data frames using dplyr::left_join.
library(readr)
library(purrr)
library(dplyr)
create the data sets
read_csv(
"REG_ID,region,reg_num,age,age_num,y_1991
BFM2,Boucle,1,c_0_4,0,770
BFM2,Boucle,1,c_5_9,5,810
BFM2,Boucle,1,c_10_14,10,704
BFM2,Boucle,1,c_15_19,15,71
BFM2,Boucle,1,c_20_24,20,181") %>%
write_csv("df_91.csv")
read_csv(
"REG_ID,region,reg_num,age,age_num,y_1992
BFM2,Boucle,1,c_0_4,0,NA
BFM2,Boucle,1,c_5_9,5,NA
BFM2,Boucle,1,c_10_14,10,NA
BFM2,Boucle,1,c_15_19,15,NA
BFM2,Boucle,1,c_20_24,20,NA") %>%
write_csv("df_92.csv")
read_csv(
"REG_ID,region,reg_num,age,age_num,y_1993
BFM2,Boucle,1,c_0_4,0,120
BFM2,Boucle,1,c_5_9,5,11
BFM2,Boucle,1,c_10_14,10,130
BFM2,Boucle,1,c_15_19,15,512
BFM2,Boucle,1,c_20_24,20,712") %>%
write_csv("df_93.csv")
Create the final merged data set
dir(".", "\\.csv", full.names = TRUE) %>%
map(read_csv) %>%
reduce(left_join, by = c("REG_ID", "region", "reg_num", "age", "age_num"))
#> # A tibble: 5 x 8
#> REG_ID region reg_num age age_num y_1991 y_1992 y_1993
#> <chr> <chr> <int> <chr> <int> <int> <chr> <int>
#> 1 BFM2 Boucle 1 c_0_4 0 770 <NA> 120
#> 2 BFM2 Boucle 1 c_5_9 5 810 <NA> 11
#> 3 BFM2 Boucle 1 c_10_14 10 704 <NA> 130
#> 4 BFM2 Boucle 1 c_15_19 15 71 <NA> 512
#> 5 BFM2 Boucle 1 c_20_24 20 181 <NA> 712
I think the best way would be:
library(data.table)
library(stringr)
data <- list()
files_to_loop <- list.files()[str_detect(list.files(), "\\.csv$")]
for (i in 1:length(files_to_loop)){
data[[i]]<-fread(files_to_loop[i])
}
data<-rbindlist(data,use.names=TRUE,fill=TRUE)

Resources