Merge dataframes with some common columns and fill in others with NAs - r

I have 50+ csv files that all share the same 4 columns, in this order:
REG_ID region age age_num
and then year columns for anything from 1990 to 2016, in this format:
REG_ID region age age_num y_1992 y_1993 y_1994 y_2014.15
and I was wondering what the best way to merge them would be. Going through each file to add the missing year columns would be time-consuming and would likely lead to errors.
The end format would be something like this:
REG_ID region reg_num age age_num y_1991 y_1992 y_1993
BFM2 Boucle 1 c_0_4 0 770 NA 120
BFM2 Boucle 1 c_5_9 5 810 NA 11
BFM2 Boucle 1 c_10_14 10 704 NA 130
BFM2 Boucle 1 c_15_19 15 71 NA 512
BFM2 Boucle 1 c_20_24 20 181 NA 712

Here's a way you can do it using tidyverse tools. First use dir() to get a vector of csv paths, then use purrr::map() to read them all in, returning a list of data frames, and then use purrr::reduce() to merge all the data frames with dplyr::left_join().
library(readr)
library(purrr)
library(dplyr)
Create the data sets:
read_csv(
"REG_ID,region,reg_num,age,age_num,y_1991
BFM2,Boucle,1,c_0_4,0,770
BFM2,Boucle,1,c_5_9,5,810
BFM2,Boucle,1,c_10_14,10,704
BFM2,Boucle,1,c_15_19,15,71
BFM2,Boucle,1,c_20_24,20,181") %>%
write_csv("df_91.csv")
read_csv(
"REG_ID,region,reg_num,age,age_num,y_1992
BFM2,Boucle,1,c_0_4,0,NA
BFM2,Boucle,1,c_5_9,5,NA
BFM2,Boucle,1,c_10_14,10,NA
BFM2,Boucle,1,c_15_19,15,NA
BFM2,Boucle,1,c_20_24,20,NA") %>%
write_csv("df_92.csv")
read_csv(
"REG_ID,region,reg_num,age,age_num,y_1993
BFM2,Boucle,1,c_0_4,0,120
BFM2,Boucle,1,c_5_9,5,11
BFM2,Boucle,1,c_10_14,10,130
BFM2,Boucle,1,c_15_19,15,512
BFM2,Boucle,1,c_20_24,20,712") %>%
write_csv("df_93.csv")
Create the final merged data set:
dir(".", "\\.csv", full.names = TRUE) %>%
map(read_csv) %>%
reduce(left_join, by = c("REG_ID", "region", "reg_num", "age", "age_num"))
#> # A tibble: 5 x 8
#> REG_ID region reg_num age age_num y_1991 y_1992 y_1993
#> <chr> <chr> <int> <chr> <int> <int> <chr> <int>
#> 1 BFM2 Boucle 1 c_0_4 0 770 <NA> 120
#> 2 BFM2 Boucle 1 c_5_9 5 810 <NA> 11
#> 3 BFM2 Boucle 1 c_10_14 10 704 <NA> 130
#> 4 BFM2 Boucle 1 c_15_19 15 71 <NA> 512
#> 5 BFM2 Boucle 1 c_20_24 20 181 <NA> 712

I think the best way would be:
library(data.table)
library(stringr)
files_to_loop <- list.files()[str_detect(list.files(), "\\.csv$")]
data <- vector("list", length(files_to_loop))
for (i in seq_along(files_to_loop)) {
  data[[i]] <- fread(files_to_loop[i])
}
data <- rbindlist(data, use.names = TRUE, fill = TRUE)
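A more compact sketch of the same idea, assuming the csv files sit in the current working directory: rbindlist() stacks the files row-wise and fills year columns that are missing from a given file with NA.
library(data.table)
# Read every csv in the working directory and stack them,
# filling columns absent from a file with NA
files <- list.files(pattern = "\\.csv$")
data  <- rbindlist(lapply(files, fread), use.names = TRUE, fill = TRUE)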

Related

dplyr arrange is not working while order is fine

I am trying to obtain the 10 largest investors in a country, but I get confusing results when using arrange in dplyr versus order in base R.
head(fdi_partner)
gives the following result:
# A tibble: 6 x 3
`Main counterparts` `Number of projects` `Total registered capital (Mill. USD)(*)`
<chr> <chr> <chr>
1 TOTAL 1818 38854.3
2 Singapore 231 11358.66
3 Korea Rep.of 377 7679.9
4 Japan 204 4325.79
5 Netherlands 24 4209.64
6 China, PR 216 3001.79
and
fdi_partner %>%
rename("Registered capital" = "Total registered capital (Mill. USD)(*)") %>%
mutate_at(c("Number of projects", "Registered capital"), as.numeric) %>%
arrange("Number of projects") %>%
head()
gives almost the same result:
# A tibble: 6 x 3
`Main counterparts` `Number of projects` `Registered capital`
<chr> <dbl> <dbl>
1 TOTAL 1818 38854.
2 Singapore 231 11359.
3 Korea Rep.of 377 7680.
4 Japan 204 4326.
5 Netherlands 24 4210.
6 China, PR 216 3002.
while the following code works fine with base R:
head(fdi_partner)
fdi_numeric <- fdi_partner %>%
rename("Registered capital" = "Total registered capital (Mill. USD)(*)") %>%
mutate_at(c("Number of projects", "Registered capital"), as.numeric)
head(fdi_numeric[order(fdi_numeric$"Number of projects", decreasing = TRUE), ], n=11)
which gives
# A tibble: 11 x 3
`Main counterparts` `Number of projects` `Registered capital`
<chr> <dbl> <dbl>
1 TOTAL 1818 38854.
2 Korea Rep.of 377 7680.
3 Singapore 231 11359.
4 China, PR 216 3002.
5 Japan 204 4326.
6 Hong Kong SAR (China) 132 2365.
7 United States 83 783.
8 Taiwan 66 1464.
9 United Kingdom 50 331.
10 F.R Germany 37 131.
11 Thailand 36 370.
Can anybody help explain what I'm doing wrong?
dplyr (and tidyverse packages more generally) expects unquoted variable names. Passing a quoted name such as arrange("Number of projects") sorts every row by the same constant string, so the row order is left unchanged. If a variable name contains spaces, wrap it in backticks rather than quotes:
library(dplyr)
test <- data.frame(`My variable` = c(3, 1, 2), var2 = c(1, 1, 1), check.names = FALSE)
test
#> My variable var2
#> 1 3 1
#> 2 1 1
#> 3 2 1
# Your code (doesn't work)
test %>%
arrange("My variable")
#> My variable var2
#> 1 3 1
#> 2 1 1
#> 3 2 1
# Solution
test %>%
arrange(`My variable`)
#> My variable var2
#> 1 1 1
#> 2 2 1
#> 3 3 1
Created on 2023-01-05 with reprex v2.0.2
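If you prefer to keep the column name as a string (for example, when it comes from a variable), a small sketch using dplyr's .data pronoun also works; col here is just an illustrative name:
library(dplyr)
col <- "My variable"       # column name stored as a string
test %>%
  arrange(.data[[col]])    # same ordering as arrange(`My variable`)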

Group and add variable of type stock and another type in a single step?

I want to group by district, summing the 'incoming' values across quarters, and get the value of 'stock' in the last quarter (3), in just one step. 'stock' cannot be summed across quarters.
My example dataframe:
library(dplyr)
df <- data.frame ("district"= rep(c("ARA", "BJI", "CMC"), each=3),
"quarter"=rep(1:3,3),
"incoming"= c(4044, 2992, 2556, 1639, 9547, 1191,2038,1942,225),
"stock"= c(19547,3160, 1533,5355,6146,355,5816,1119,333)
)
df
district quarter incoming stock
1 ARA 1 4044 19547
2 ARA 2 2992 3160
3 ARA 3 2556 1533
4 BJI 1 1639 5355
5 BJI 2 9547 6146
6 BJI 3 1191 355
7 CMC 1 2038 5816
8 CMC 2 1942 1119
9 CMC 3 225 333
The actual dataframe has ~45,000 rows and 41 variables, of which 8 are of the stock type.
The result should be:
# A tibble: 3 × 3
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205
I know how to get the result, but it takes three steps, which I don't think is efficient, and it is error-prone given the data.
My approach:
basea <- df %>%
  group_by(district) %>%
  filter(quarter == 3) %>%  # take only the last quarter
  summarise(across(stock, sum))
baseb <- df %>%
  group_by(district) %>%
  summarise(across(incoming, sum))
final <- full_join(basea, baseb)
Does anyone have any suggestions to perform the procedure in one (or at least two) steps?
Grateful,
Modus
This assumes the dataset only has 3 quarters and not 4; if that's not the case, use nth(stock, 3) instead of last(stock).
library(tidyverse)
df %>%
group_by(district) %>%
summarise(stock = last(stock),
incoming = sum(incoming))
# A tibble: 3 × 3
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205
Here is a data.table approach:
library(data.table)
setDT(df)[, .(incoming = sum(incoming), stock = stock[.N]), by = .(district)]
district incoming stock
1: ARA 9592 1533
2: BJI 12377 355
3: CMC 4205 333
Here's a refactor that removes some of the duplicated code. This also seems like a prime use case for creating a custom function that can be QC'd and maintained more easily:
library(dplyr)
df <- data.frame ("district"= rep(c("ARA", "BJI", "CMC"), each=3),
"quarter"=rep(1:3,3),
"incoming"= c(4044, 2992, 2556, 1639, 9547, 1191,2038,1942,225),
"stock"= c(19547,3160, 1533,5355,6146,355,5816,1119,333)
)
aggregate_stocks <- function(df, n_quarter) {
  base <- df %>%
    group_by(district)
  basea <- base %>%
    filter(quarter == n_quarter) %>%
    summarise(across(stock, sum))
  baseb <- base %>%
    summarise(across(incoming, sum))
  final <- full_join(basea, baseb, by = "district")
  return(final)
}
aggregate_stocks(df, 3)
#> # A tibble: 3 × 3
#> district stock incoming
#> <chr> <dbl> <dbl>
#> 1 ARA 1533 9592
#> 2 BJI 355 12377
#> 3 CMC 333 4205
Here is the same solution as @Tom Hoel's, but instead of using a function to subset, it just uses [:
library(dplyr)
df %>%
group_by(district) %>%
summarise(stock = stock[3],
incoming = sum(incoming))
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205
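If the latest quarter is not guaranteed to be quarter 3 (or the third row) in every district, a hedged variant that takes the stock at whichever quarter is largest would be:
library(dplyr)
df %>%
  group_by(district) %>%
  summarise(stock    = stock[which.max(quarter)],  # stock at the latest available quarter
            incoming = sum(incoming))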

R Panel data: Create new variable based on ifelse() statement and previous row

My question refers to the following (simplified) panel data, for which I would like to create some sort of xrd_stock.
#Setup data
library(tidyverse)
firm_id <- c(rep(1, 5), rep(2, 3), rep(3, 4))
firm_name <- c(rep("Cosco", 5), rep("Apple", 3), rep("BP", 4))
fyear <- c(seq(2000, 2004, 1), seq(2003, 2005, 1), seq(2005, 2008, 1))
xrd <- c(49,93,121,84,37,197,36,154,104,116,6,21)
df <- data.frame(firm_id, firm_name, fyear, xrd)
#Define variables
growth = 0.08
depr = 0.15
For a new variable called xrd_stock I'd like to apply the following mechanics:
each firm_id should be handled separately: group_by(firm_id)
where fyear is at minimum, calculate xrd_stock as: xrd/(growth + depr)
otherwise, calculate xrd_stock as: xrd + (1-depr) * [xrd_stock from previous row]
With the following code, I have already succeeded with steps 1 and 2 and parts of step 3.
df2 <- df %>%
  ungroup() %>%
  group_by(firm_id) %>%
  arrange(firm_id, fyear) %>% # ensure data is in ascending fyear order; not required in this specific example as df is already in the correct order
  mutate(xrd_stock = ifelse(fyear == min(fyear), xrd/(growth + depr), xrd + (1-depr)*lag(xrd_stock)))
Difficulties occur in the else part of the function, such that R returns:
Error: Problem with `mutate()` input `xrd_stock`.
x object 'xrd_stock' not found
i Input `xrd_stock` is `ifelse(...)`.
i The error occured in group 1: firm_id = 1.
Run `rlang::last_error()` to see where the error occurred.
From this error message, I understand that R cannot refer to the just-created xrd_stock in the previous row (which makes sense if R is not working strictly from top to bottom); however, when I simply put a 9 in the else part, the code above runs without any errors.
Can anyone help me with this problem so that the results eventually look as shown below? I am more than happy to answer additional questions if required. Thank you very much in advance to everyone who looks at my question :-)
Target results (Excel-calculated):
id name fyear xrd xrd_stock Calculation for xrd_stock
1 Cosco 2000 49 213 =49/(0.08+0.15)
1 Cosco 2001 93 274 =93+(1-0.15)*213
1 Cosco 2002 121 354 …
1 Cosco 2003 84 385 …
1 Cosco 2004 37 364 …
2 Apple 2003 197 857 =197/(0.08+0.15)
2 Apple 2004 36 764 =36+(1-0.15)*857
2 Apple 2005 154 803 …
3 BP 2005 104 452 …
3 BP 2006 116 500 …
3 BP 2007 6 431 …
3 BP 2008 21 388 …
Arrange the data by fyear so the minimum year is always the first row; you can then use accumulate to do the calculation.
library(dplyr)
df %>%
arrange(firm_id, fyear) %>%
group_by(firm_id) %>%
mutate(xrd_stock = purrr::accumulate(xrd[-1], ~.y + (1-depr) * .x,
.init = first(xrd)/(growth + depr)))
# firm_id firm_name fyear xrd xrd_stock
# <dbl> <chr> <dbl> <dbl> <dbl>
# 1 1 Cosco 2000 49 213.
# 2 1 Cosco 2001 93 274.
# 3 1 Cosco 2002 121 354.
# 4 1 Cosco 2003 84 385.
# 5 1 Cosco 2004 37 364.
# 6 2 Apple 2003 197 857.
# 7 2 Apple 2004 36 764.
# 8 2 Apple 2005 154 803.
# 9 3 BP 2005 104 452.
#10 3 BP 2006 116 500.
#11 3 BP 2007 6 431.
#12 3 BP 2008 21 388.
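For comparison, the same recursion can be written with base R's Reduce(..., accumulate = TRUE), the base-R analogue of purrr::accumulate(); this is only a sketch using the df, growth and depr objects defined in the question:
library(dplyr)
df %>%
  arrange(firm_id, fyear) %>%
  group_by(firm_id) %>%
  mutate(xrd_stock = Reduce(
    function(prev, x) x + (1 - depr) * prev,  # current xrd plus depreciated previous stock
    xrd[-1],
    init = xrd[1] / (growth + depr),          # starting stock in the firm's first year
    accumulate = TRUE
  )) %>%
  ungroup()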

How to create a Markdown table with different column lengths based on a dataframe in long format in R?

I'm working on an R Markdown file that I would like to submit as a manuscript to an academic journal. I would like to create a table that shows which three words (item2) co-occur most frequently with some keywords (item1). Note that some keywords have more than three co-occurring words. The data I am currently working with:
item1 <- c("water","water","water","water","water","sun","sun","sun","sun","moon","moon","moon")
item2 <- c("tree","dog","cat","fish","eagle","bird","table","bed","flower","house","desk","tiger")
n <- c("200","83","34","34","34","300","250","77","77","122","46","46")
df <- data.frame(item1,item2,n)
Which gives this dataframe:
item1 item2 n
1 water tree 200
2 water dog 83
3 water cat 34
4 water fish 34
5 water eagle 34
6 sun bird 300
7 sun table 250
8 sun bed 77
9 sun flower 77
10 moon house 122
11 moon desk 46
12 moon tiger 46
Ultimately, I would like to pass the data to the function papaja::apa_table, which requires a data.frame (or a matrix / list). I therefore need to reshape the data.
My question:
How can I reshape the data (preferably with dplyr) to get the following structure?
water_item2 water_n sun_item2 sun_n moon_item2 moon_n
1 tree 200 bird 300 house 122
2 dog 83 table 250 desk 46
3 cat 34 bed 77 tiger 46
4 fish 34 flower 77 <NA> <NA>
5 eagle 34 <NA> <NA> <NA> <NA>
We can borrow an approach from an old answer of mine to a different question, and modify a classic gather(), unite(), spread() strategy by creating unique identifiers by group to avoid duplicate identifiers, then dropping that variable:
library(dplyr)
library(tidyr)
item1 <- c("water","water","water","water","water","sun","sun","sun","sun","moon","moon","moon")
item2 <- c("tree","dog","cat","fish","eagle","bird","table","bed","flower","house","desk","tiger")
n <- c("200","83","34","34","34","300","250","77","77","122","46","46")
# Owing to Richard Telford's excellent comment,
# I use data_frame() (or equivalently for our purposes,
# data.frame(..., stringsAsFactors = FALSE))
# to avoid turning the strings into factors
df <- data_frame(item1,item2,n)
df %>%
group_by(item1) %>%
mutate(id = 1:n()) %>%
ungroup() %>%
gather(temp, val, item2, n) %>%
unite(temp2, item1, temp, sep = '_') %>%
spread(temp2, val) %>%
select(-id)
# A tibble: 5 x 6
moon_item2 moon_n sun_item2 sun_n water_item2 water_n
<chr> <chr> <chr> <chr> <chr> <chr>
1 house 122 bird 300 tree 200
2 desk 46 table 250 dog 83
3 tiger 46 bed 77 cat 34
4 NA NA flower 77 fish 34
5 NA NA NA NA eagle 34
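In current tidyr, gather()/unite()/spread() are superseded by pivot_longer()/pivot_wider(). Here is a sketch of the same reshape with pivot_wider(); this assumes a tidyr version with the names_glue argument (tidyr >= 1.0), and the resulting columns may need a final select() or relocate() to match the order shown above:
library(dplyr)
library(tidyr)
df %>%
  group_by(item1) %>%
  mutate(id = row_number()) %>%  # rank of each co-occurring word within its keyword
  ungroup() %>%
  pivot_wider(
    names_from  = item1,
    values_from = c(item2, n),
    names_glue  = "{item1}_{.value}"  # e.g. water_item2, water_n
  ) %>%
  select(-id)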

Iteration for time series data, using purrr

I have a bunch of time series data stacked on top of one another in a data frame; one series for each region in a country. I'd like to apply the seas() function (from the seasonal package) to each series, iteratively, to make the series seasonally adjusted. To do this, I first have to convert the series to a ts class. I'm struggling to do all this using purrr.
Here's a minimum worked example:
library(seasonal)
library(tidyverse)
set.seed(1234)
df <- data.frame(region = rep(1:10, each = 20),
quarter = rep(1:20, 10),
var = sample(5:200, 200, replace = T))
For each region (indexed by a number) I'd like to perform the following operations. Here's the first region as an example:
tem1 <- df %>% filter(region==1)
tem2 <- ts(data = tem1$var, frequency = 4, start=c(1990,1))
tem3 <- seas(tem2)
tem4 <- as.data.frame(tem3$data)
I'd then like to stack the output (ie. the multiple tem4 data frames, one for each region), along with the region and quarter identifiers.
So, the start of the output for region 1 would be this:
final seasonaladj trend irregular region quarter
1 27 27 96.95 -67.97279 1 1
2 126 126 96.95 27.87381 1 2
3 124 124 96.95 27.10823 1 3
4 127 127 96.95 30.55075 1 4
5 173 173 96.95 75.01355 1 5
6 130 130 96.95 32.10672 1 6
The data for region 2 would be below this etc.
I started with the following but without luck so far. Basically, I'm struggling to get the time series into the tibble:
seas.adjusted <- df %>%
group_by(region) %>%
mutate(data.ts = map(.x = data$var,
.f = as.ts,
start = 1990,
freq = 4))
I don't know much about the seasonal adjustment part, so there may be things I missed, but I can help with moving your calculations into a map-friendly function.
After grouping by region, you can nest the data so there's a nested data frame for each region. Then you can run essentially the same code as you had, but inside a function in map. Unnesting the resulting column gives you a long-shaped data frame of adjustments.
Like I said, I don't have the expertise to know whether those last two columns having NAs is expected or not.
Edit: Based on @wibeasley's question about retaining the quarter column, I'm adding a mutate that adds a column of the quarters listed in the nested data frame.
library(seasonal)
library(tidyverse)
set.seed(1234)
df <- data.frame(region = rep(1:10, each = 20),
quarter = rep(1:20, 10),
var = sample(5:200, 200, replace = T))
df %>%
  group_by(region) %>%
  nest() %>%
  mutate(data.ts = map(data, function(x) {
    tem2 <- ts(x$var, frequency = 4, start = c(1990, 1))
    tem3 <- seas(tem2)
    as.data.frame(tem3$data) %>%
      mutate(quarter = x$quarter)
  })) %>%
  unnest(data.ts)
#> # A tibble: 200 x 8
#> region final seasonaladj trend irregular quarter seasonal adjustfac
#> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
#> 1 1 27 27 97.0 -68.0 1 NA NA
#> 2 1 126 126 97.0 27.9 2 NA NA
#> 3 1 124 124 97.0 27.1 3 NA NA
#> 4 1 127 127 97.0 30.6 4 NA NA
#> 5 1 173 173 97.0 75.0 5 NA NA
#> 6 1 130 130 97.0 32.1 6 NA NA
#> 7 1 6 6 97.0 -89.0 7 NA NA
#> 8 1 50 50 97.0 -46.5 8 NA NA
#> 9 1 135 135 97.0 36.7 9 NA NA
#> 10 1 105 105 97.0 8.81 10 NA NA
#> # ... with 190 more rows
I also gave a bit more thought to doing this without nesting, and instead tried doing it with a split. Passing that list of data frames into imap_dfr let me take each split piece of the data frame and its name (in this case, the value of region), then return everything bound back together into one data frame. I sometimes shy away from nested data just because I have trouble seeing what's going on, so this is an alternative that is maybe more transparent.
df %>%
  split(.$region) %>%
  imap_dfr(function(x, reg) {
    tem2 <- ts(x$var, frequency = 4, start = c(1990, 1))
    tem3 <- seas(tem2)
    as.data.frame(tem3$data) %>%
      mutate(region = reg, quarter = x$quarter)
  }) %>%
  select(region, quarter, everything()) %>%
  head()
#> region quarter final seasonaladj trend irregular seasonal adjustfac
#> 1 1 1 27 27 96.95 -67.97274 NA NA
#> 2 1 2 126 126 96.95 27.87378 NA NA
#> 3 1 3 124 124 96.95 27.10823 NA NA
#> 4 1 4 127 127 96.95 30.55077 NA NA
#> 5 1 5 173 173 96.95 75.01353 NA NA
#> 6 1 6 130 130 96.95 32.10669 NA NA
Created on 2018-08-12 by the reprex package (v0.2.0).
I put all the action inside of f(), and then called it with purrr::map_df(). The re-inclusion of quarter is a hack.
f <- function(.region) {
  d <- df %>%
    dplyr::filter(region == .region)
  y <- d %>%
    dplyr::pull(var) %>%
    ts(frequency = 4, start = c(1990, 1)) %>%
    seas()
  y$data %>%
    as.data.frame() %>%
    # dplyr::select(-seasonal, -adjustfac) %>%
    dplyr::mutate(
      quarter = d$quarter
    )
}
purrr::map_df(1:10, f, .id = "region")
results:
region final seasonaladj trend irregular quarter seasonal adjustfac
1 1 27.00000 27.00000 96.95000 -6.797279e+01 1 NA NA
2 1 126.00000 126.00000 96.95000 2.787381e+01 2 NA NA
3 1 124.00000 124.00000 96.95000 2.710823e+01 3 NA NA
4 1 127.00000 127.00000 96.95000 3.055075e+01 4 NA NA
5 1 173.00000 173.00000 96.95000 7.501355e+01 5 NA NA
6 1 130.00000 130.00000 96.95000 3.210672e+01 6 NA NA
7 1 6.00000 6.00000 96.95000 -8.899356e+01 7 NA NA
8 1 50.00000 50.00000 96.95000 -4.647254e+01 8 NA NA
9 1 135.00000 135.00000 96.95000 3.671077e+01 9 NA NA
10 1 105.00000 105.00000 96.95000 8.806955e+00 10 NA NA
...
96 5 55.01724 55.01724 60.25848 9.130207e-01 16 1.9084928 1.9084928
97 5 60.21549 60.21549 59.43828 1.013076e+00 17 1.0462424 1.0462424
98 5 58.30626 58.30626 58.87065 9.904130e-01 18 0.1715082 0.1715082
99 5 61.68175 61.68175 58.07827 1.062045e+00 19 1.0537962 1.0537962
100 5 59.30138 59.30138 56.70798 1.045733e+00 20 2.5294523 2.5294523
...
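A closely related alternative that avoids both nest() and split() is dplyr::group_modify() (a recent dplyr is assumed); this is only a sketch, assuming seas() behaves as in the answers above:
library(dplyr)
library(seasonal)
df %>%
  group_by(region) %>%
  group_modify(function(x, key) {
    # x holds one region's rows (without the grouping column); region is added back automatically
    tem2 <- ts(x$var, frequency = 4, start = c(1990, 1))
    tem3 <- seas(tem2)
    as.data.frame(tem3$data) %>%
      mutate(quarter = x$quarter)
  }) %>%
  ungroup()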
