Add space after - in a value - r

Based on the data below, how can I add a space after the special character "-"? I know I have to use gsub, but it always confuses me, so some explanation would be appreciated.
Sample data and code:
id = c (1,2,3,4,5,6,7,8,9,10)
FiscalYear = c("2012 -2013", "2012 -2013", "2012 -2013", "2012 -2013", "2012 -2013",
"2012 -2013", "2012 -2013", "2012 -2013", "2012 -2013", "2012 -2013")
# Sample
df = data.frame(id, FiscalYear)
# Updated Sample
df_new = df %>% gsub....
# str_pad does not work
df_updated = df %>% with(stringr::str_pad(FiscalYear, width = 6, pad = " "))

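In the tidyverse, values are changed with mutate. The changes are arguments to this function. Here sub(pattern, replacement, x) replaces the first "-" in each value with "- "; gsub takes the same arguments but replaces every match, and because each value contains only one "-", both give the same result.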
# Attach dplyr quietly; it provides mutate() and the %>% pipe.
suppressPackageStartupMessages(library(dplyr))

# Sample: ten rows that all carry the same fiscal-year label.
id <- as.numeric(1:10)
FiscalYear <- rep("2012 -2013", times = 10)
df <- data.frame(id, FiscalYear)

# Updated sample: sub() swaps the first "-" of each value for "- ",
# which puts a space after the hyphen.
df_new <- mutate(df, FiscalYear = sub("-", "- ", FiscalYear))
df_new
#> id FiscalYear
#> 1 1 2012 - 2013
#> 2 2 2012 - 2013
#> 3 3 2012 - 2013
#> 4 4 2012 - 2013
#> 5 5 2012 - 2013
#> 6 6 2012 - 2013
#> 7 7 2012 - 2013
#> 8 8 2012 - 2013
#> 9 9 2012 - 2013
#> 10 10 2012 - 2013
Created on 2022-11-04 with reprex v2.0.2

Related

Combine migration in and out data

I have two datasets, one for migration inflow to county A from other counties and other for migration outflow from county A to other counties. In order to combine the two data sets as:
Desired output:
Key County State FIPS Inflow Outflow FiscalYear Year
510012012 Accomack County VA 51001 NA 27 2011 - 2012 2012
160012012 Ada County ID 16001 16 16 2011 - 2012 2012
80012012 Adams County CO 8001 39 30 2011 - 2012 2012
80012011 Adams County CO 8001 42 31 2010 - 2011 2011
450032012 Aiken County SC 45003 NA 21 2011 - 2012 2012
120012012 Alachua County FL 12001 433 NA 2011 - 2012 2012
How can I combine the two into one dataset in such a way that I don't have to hardcode each and every common county and state name and FIPS and Year? Missing values would be NA.
The common value between the two data sets is the key.
My original migration outflow data has 517 observations and migration inflow has 441, thus different number of counties in each dataset.
Sample data:
# People moving into county A from other counties (the origin counties)
inflow_df = structure(list(Origin_FIPS = c(12001L, 8001L, 16001L, 12001L,
8001L, 16001L), Origin_StateName = c("FL", "CO", "ID", "FL",
"CO", "ID"), Origin_Place = c("Alachua County", "Adams County",
"Ada County", "Alachua County", "Adams County", "Ada County"),
InIndividuals = c(433L, 30L, 16L, 381L, 42L, 21L), FiscalYear = c("2011 - 2012",
"2011 - 2012", "2011 - 2012", "2011 - 2012", "2010 - 2011",
"2010 - 2011"), Year = c(2012L, 2012L, 2012L, 2011L, 2011L,
2011L), Key = c(120012012L, 80012012L, 160012012L, 120012011L,
80012011L, 160012011L)), class = "data.frame", row.names = c(NA,
-6L))
# People moving out of county A to other counties (the destination counties)
outflow_df = structure(list(Dest_FIPS = c(51001L, 16001L, 8001L, 8001L, 45003L
), Dest_StateName = c("VA", "ID", "CO", "CO", "SC"), Dest_Place = c("Accomack County",
"Ada County", "Adams County", "Adams County", "Aiken County"),
OutIndividuals = c(27L, 16L, 39L, 31L, 21L), FiscalYear = c("2011 - 2012",
"2011 - 2012", "2011 - 2012", "2011 - 2012", "2011 - 2012"
), Year = c(2012L, 2012L, 2012L, 2011L, 2012L), Key = c(510012012L,
160012012L, 80012012L, 80012011L, 450032012L)), class = "data.frame", row.names = c(NA,
-5L))
We can collate the two tables by giving them consistent names (presumably Origin_Place in one should match with Dest_Place in the other) and then performing a join. full_join outputs all the keys found in either table, in this case c("Key", "County", "State", "FIPS", "FiscalYear", "Year").
I would have expected that the inflow_df would reflect the counties that are seeing inflows (ie the destinations) and outflow_df would reflect the counties that have outflows (ie the origins), so it seems possible the table names are swapped in the question.
# Bring both tables onto one shared set of column names so they can be joined.
# select() picks and renames in a single step; the columns come out in the
# order they are listed.
inflow2 <- select(
  inflow_df,
  Key,
  County = Origin_Place,
  State = Origin_StateName,
  FIPS = Origin_FIPS,
  Inflow = InIndividuals,
  FiscalYear,
  Year
)
outflow2 <- select(
  outflow_df,
  Key,
  County = Dest_Place,
  State = Dest_StateName,
  FIPS = Dest_FIPS,
  Outflow = OutIndividuals,
  FiscalYear,
  Year
)
# No `by` is supplied, so the join matches on every column name the two
# tables have in common; rows found in only one table are kept with NA.
full_join(inflow2, outflow2)
Result (updated with data from 2022-11-04)
Joining, by = c("Key", "County", "State", "FIPS", "FiscalYear", "Year")
Key County State FIPS Inflow FiscalYear Year Outflow
1 120012012 Alachua County FL 12001 433 2011 - 2012 2012 NA
2 80012012 Adams County CO 8001 30 2011 - 2012 2012 39
3 160012012 Ada County ID 16001 16 2011 - 2012 2012 16
4 120012011 Alachua County FL 12001 381 2011 - 2012 2011 NA
5 80012011 Adams County CO 8001 42 2010 - 2011 2011 NA
6 160012011 Ada County ID 16001 21 2010 - 2011 2011 NA
7 510012012 Accomack County VA 51001 NA 2011 - 2012 2012 27
8 80012011 Adams County CO 8001 NA 2011 - 2012 2011 31
9 450032012 Aiken County SC 45003 NA 2011 - 2012 2012 21

Create column based on ordering in another column in R

I have a dataframe which is a much longer version of this:
council_name <- c("Southwark", "Southwark", "Southwark", "Lambeth", "Lambeth", "Lambeth", "Yorkshire", "Yorkshire", "Yorkshire")
quarter <- c("2006 Q1", "2006 Q2", "2006 Q3", "2006 Q1", "2006 Q2", "2006 Q3","2006 Q1", "2006 Q2", "2006 Q3")
treat <- c(1, 0, 1, 0, 0, 1, 0, 0, 0)
df.desired <- as.data.frame(c(council_name, as.yearqtr(quarter), treat, df, first.treatment))
What I want is a column with the value of "quarter" when "treatment" is 1 for the first time for each value of "council_name". And is "0" if "treatment" is never 1 for a specific council_name.
This would look something like this:
library(zoo)
council_name <- c("Southwark", "Southwark", "Southwark", "Lambeth", "Lambeth", "Lambeth", "Yorkshire", "Yorkshire", "Yorkshire")
quarter <- c("2006 Q1", "2006 Q2", "2006 Q3", "2006 Q1", "2006 Q2", "2006 Q3","2006 Q1", "2006 Q2", "2006 Q3")
treat <- c(1, 0, 1, 0, 0, 1, 0, 0, 0)
first.treatment <- c("2006 Q1", "2006 Q3", 0)
df.desired <- as.data.frame <- c(council_name, as.yearqtr(quarter), treat, df, first.treatment)
I tried different things with group_by and sorting but I never quite get what I am looking for.
An example of what I tried is:
merged2%>%
group_by(council_name, year_qtr)%>%
arrange(year_qtr)%>%
mutate(first.treatment = by(year_qtr, head, 1))
but got:
Error: Problem with `mutate()` input `first.treatment`. x unique() applies only to vectors ℹ Input `first.treatment` is `by(year_qtr, head, 1)`. ℹ The error occured in group 1: council_name = "Adur", year_qtr = 2006 Q2.
Many thanks!
When using group_by, the mutate call evaluates its expressions separately for each group, one group after another.
Therefore, you can write something like this:
# For every council, attach the first quarter in which treat == 1.
# Rows are sorted by quarter so that "[1]" means "earliest"; a council that is
# never treated gets NA because the filtered vector is empty.
tibble(council_name, year_qtr = as.yearqtr(quarter), treat) %>%
  group_by(council_name) %>%
  arrange(year_qtr) %>%
  mutate(first_treatment = year_qtr[treat == 1][1]) %>%
  # Drop the grouping so later steps are not silently computed per council.
  ungroup() %>%
  arrange(council_name, year_qtr)
or
# One row per council: the earliest quarter with treat == 1 (NA if there is
# none). Sorting by quarter first is what makes "[1]" pick the earliest.
treatment_tbl <- tibble(council_name, year_qtr = as.yearqtr(quarter), treat)
treatment_tbl %>%
  arrange(year_qtr) %>%
  group_by(council_name) %>%
  summarise(first_treatment = year_qtr[treat == 1][1])
For each group, this asks for the year_qtr column where treat==1, and takes the first value of the resulting vector. This is why it is important to sort beforehand (arrange).
I had to adapt the example data a bit, but I am hopeful that this is what you meant.
I do not like the idea of returning either a string or 0. One should always return the same data type. That is why my answer returns either the quarter or NA. Should you insist on returning 0, that could easily be "fixed" using is.na.
council_name <- c("Southwark", "Southwark", "Southwark", "Lambeth", "Lambeth", "Lambeth", "Yorkshire", "Yorkshire", "Yorkshire")
quarter <- c("2006 Q1", "2006 Q2", "2006 Q3", "2006 Q1", "2006 Q2", "2006 Q3","2006 Q1", "2006 Q2", "2006 Q3")
treat <- c(1, 0, 1, 0, 0, 1, 0, 0, 0)
df <- data.frame(council_name, quarter, treat)
# Return the `quarter` of the first row of `d` whose `treat` equals 1.
# When no row qualifies, match() yields NA and so does the lookup.
treat.one <- function(d) {
  first_hit <- match(1, d$treat)
  d$quarter[first_hit]
}
by(df, council_name, treat.one)
this takes
council_name quarter treat
1 Southwark 2006 Q1 1
2 Southwark 2006 Q2 0
3 Southwark 2006 Q3 1
4 Lambeth 2006 Q1 0
5 Lambeth 2006 Q2 0
6 Lambeth 2006 Q3 1
7 Yorkshire 2006 Q1 0
8 Yorkshire 2006 Q2 0
9 Yorkshire 2006 Q3 0
and returns
> by(df, council_name, treat.one)
council_name: Lambeth
[1] "2006 Q3"
-----------------------------------------
council_name: Southwark
[1] "2006 Q1"
-----------------------------------------
council_name: Yorkshire
[1] NA

How to keep only specific rows in a dataframe? [duplicate]

This question already has answers here:
Remove duplicated rows
(10 answers)
Closed 3 years ago.
I have a dataframe with 213 rows indicating quarters. Here is just a chunk:
quart <- c("2000 Q1", "2000 Q1", "2000 Q1", "2000 Q1", "2000 Q2", "2000 Q2", "2000 Q2", "2000 Q3", "2000 Q3", "2000 Q4", "2000 Q4", "2000 Q4", "2000 Q4", "2001 Q1", "2001 Q1", "2001 Q2", "2001 Q2", "2001 Q2", "2001 Q2")
df <- data.frame(quart)
quart
1 2000 Q1
2 2000 Q1
3 2000 Q1
4 2000 Q1
5 2000 Q2
6 2000 Q2
7 2000 Q2
8 2000 Q3
9 2000 Q3
10 2000 Q4
11 2000 Q4
12 2000 Q4
13 2000 Q4
14 2001 Q1
15 2001 Q1
16 2001 Q2
17 2001 Q2
18 2001 Q2
19 2001 Q2
I would like to take just the first element of each new quarter. To make it clear:
quart
1 2000 Q1
2 2000 Q2
3 2000 Q3
4 2000 Q4
5 2001 Q1
6 2001 Q2
Can anyone help me?
Thanks!
One very simple method might be to simply use unique():
# Sample quarters; `times` says how often each quarter is repeated.
quart <- rep(
  c("2000 Q1", "2000 Q2", "2000 Q3", "2000 Q4", "2001 Q1", "2001 Q2"),
  times = c(4, 3, 2, 4, 2, 4)
)
df <- data.frame(quart)
# unique() on a data frame keeps the first occurrence of every distinct row.
df2 <- unique(df)
You can use slice() on a grouped data frame via dplyr
library(dplyr)
# Sort the quarters, group identical quarters together, and let slice()
# keep row 1 of every group.
sorted_quarters <- arrange(df, quart)
slice(group_by(sorted_quarters, quart), 1)
you could just ask for values that are not duplicated.
# Keep the rows whose `quart` value has not been seen earlier in `have`;
# drop = FALSE keeps the result a data frame even with a single column.
Want <- have[!duplicated(have[["quart"]]), , drop = FALSE]

applying str_split to a column in dataframe

I have the following df named i:
structure(list(price = c(11772, 14790, 2990, 1499, 21980, 27999
), fuel = c("diesel", "petrol", "petrol", "diesel", "diesel",
"petrol"), gearbox = c("manual", "manual", "manual", "manual",
"automatic", "manual"), colour = c("white", "purple", "yellow",
"silver", "red", "rising blue metalli"), engine_size = c(1685,
1199, 998, 1753, 2179, 1984), mileage = c(18839, 7649, 45058,
126000, 31891, 100), year = c("2013 hyundai ix35", "2016 citroen citroen ds3 cabrio",
"2007 peugeot 107 hatchback", "2007 ford ford focus hatchback", "2012 jaguar xf saloon",
"2016 volkswagen scirocco coupe"), doors = c(5, 2, 3, 5, 4, 3
)), .Names = c("price", "fuel", "gearbox", "colour", "engine_size",
"mileage", "year", "doors"), row.names = c(NA, 6L), class = "data.frame")
Some of the words in column 'year' are duplicated. I would like to remove them. As a first step I would like to separate the character string in this column in separate words.
I was able to do it for a separate string, but when I try to apply it to the whole data frame it gives an error
unlist(str_split( "2013 hyunday ix35", "[[:blank:]]"))
[1] "2013" "hyunday" "ix35"
for( k in 1:nrow(i))
+ i[k,7]<-unlist(str_split( i[k, 7], "[[:blank:]]"))
Error in [<-.data.frame(*tmp*, k, 7, value = c("2013", "hyundai", :
replacement has 3 rows, data has 1
We can split by one or more spaces (\\s+) and paste the unique elements together by looping through the list output (sapply(..)).
# Split each value on runs of whitespace, drop repeated words (keeping the
# first occurrence), and glue the rest back together with single spaces.
# vapply() is the type-checked form of sapply(): it always returns a character
# vector, whereas sapply() would return an empty list if `i` had no rows.
i$year <- vapply(
  strsplit(i$year, "\\s+"),
  function(words) paste(unique(words), collapse = " "),
  character(1)
)
Working with dplyr and stringr (with help of purrr for working with list), you could do this :
library(dplyr)
# Split `year` on blanks, keep each word once, and paste the words back
# together; map_chr() returns one string per row for the new column.
df %>%
  mutate(
    newyear = purrr::map_chr(
      stringr::str_split(year, pattern = "[[:blank:]]"),
      function(words) paste(unique(words), collapse = " ")
    )
  )
#> price fuel gearbox colour engine_size mileage
#> 1 11772 diesel manual white 1685 18839
#> 2 14790 petrol manual purple 1199 7649
#> 3 2990 petrol manual yellow 998 45058
#> 4 1499 diesel manual silver 1753 126000
#> 5 21980 diesel automatic red 2179 31891
#> 6 27999 petrol manual rising blue metalli 1984 100
#> year doors newyear
#> 1 2013 hyundai ix35 5 2013 hyundai ix35
#> 2 2016 citroen citroen ds3 cabrio 2 2016 citroen ds3 cabrio
#> 3 2007 peugeot 107 hatchback 3 2007 peugeot 107 hatchback
#> 4 2007 ford ford focus hatchback 5 2007 ford focus hatchback
#> 5 2012 jaguar xf saloon 4 2012 jaguar xf saloon
#> 6 2016 volkswagen scirocco coupe 3 2016 volkswagen scirocco coupe

How to create a function and a loop to calculate growth rates of variables in a data frame in R

New to R and Stack Overflow. Suppose I have the following macroeconomic data loaded into a data frame called testdata in R.
> testdata
date gdp cpi_index rpi_index
21 2013 Q1 409985 125.067 247.4
22 2013 Q2 412620 125.971 249.7
23 2013 Q3 415577 126.352 250.9
24 2013 Q4 417265 127.123 252.5
25 2014 Q1 420091 127.241 253.9
26 2014 Q2 423249 128.139 256.0
27 2014 Q3 426022 128.191 256.9
28 2014 Q4 428347 128.312 257.4
I want to generate a new data frame called testdata_growth which contains the q-o-q growth rates for the macro variables in testdata. Currently my way of going about this is the following:
# Generating q-o-q growth rates: (x_t - x_{t-1}) / x_{t-1}.
# The change is divided by the PREVIOUS quarter's level, so the denominator
# drops the last observation (head(x, -1)) rather than the first one.
# The first quarter has no predecessor and is therefore NA.
gdp_growth <- c(NA, diff(testdata$gdp) / head(testdata$gdp, -1))
rpi_index_growth <- c(NA, diff(testdata$rpi_index) / head(testdata$rpi_index, -1))
cpi_index_growth <- c(NA, diff(testdata$cpi_index) / head(testdata$cpi_index, -1))
# Combining growth rates into a new data frame
testdata_growth <- data.frame(
  testdata$date,
  gdp_growth,
  rpi_index_growth,
  cpi_index_growth
)
My question is how I can code the above into a loop, so that I can generate the new data frame with growth rates quicker (as I have dozens of macroeconomic variables that I need to apply this growth rate calculation to).
Any assistance would be greatly appreciated.
Thanks!
(Also, if you have any comments on how to improve my question, I would take these into consideration the next time I post onto Stack Overflow - many thanks!)
Edit: Added dput(testdata) below
> dput(testdata)
structure(list(date = structure(21:28, .Label = c("2008 Q1",
"2008 Q2", "2008 Q3", "2008 Q4", "2009 Q1", "2009 Q2", "2009 Q3",
"2009 Q4", "2010 Q1", "2010 Q2", "2010 Q3", "2010 Q4", "2011 Q1",
"2011 Q2", "2011 Q3", "2011 Q4", "2012 Q1", "2012 Q2", "2012 Q3",
"2012 Q4", "2013 Q1", "2013 Q2", "2013 Q3", "2013 Q4", "2014 Q1",
"2014 Q2", "2014 Q3", "2014 Q4"), class = "factor"), gdp = c(409985L,
412620L, 415577L, 417265L, 420091L, 423249L, 426022L, 428347L
), cpi_index = c(125.067, 125.971, 126.352, 127.123, 127.241,
128.139, 128.191, 128.312), rpi_index = c(247.4, 249.7, 250.9,
252.5, 253.9, 256, 256.9, 257.4)), .Names = c("date", "gdp",
"cpi_index", "rpi_index"), row.names = 21:28, class = "data.frame")
You can use data.table too. data.table is a very powerful data manipulation package. You can get started here.
library("data.table")
# Growth of columns 2 to 4 relative to the previous row; shift() supplies
# the lagged values, so the first row is NA.
testdata_dt <- as.data.table(testdata)
testdata_dt[, lapply(.SD, function(col) col / shift(col) - 1), .SDcols = 2:4]
gdp cpi_index rpi_index
1: NA NA NA
2: 0.006427064 0.0072281257 0.009296686
3: 0.007166400 0.0030245056 0.004805767
4: 0.004061822 0.0061020008 0.006377043
5: 0.006772674 0.0009282349 0.005544554
6: 0.007517419 0.0070574736 0.008270973
7: 0.006551699 0.0004058093 0.003515625
8: 0.005457465 0.0009439040 0.001946283
library(dplyr)
# Growth of every non-date column relative to the previous row.
# across() replaces the deprecated mutate_each()/funs() pair; lag() shifts
# each column down by one row, so the first row is NA.
testdata %>%
  select(-date) %>%
  mutate(across(everything(), ~ .x / lag(.x) - 1))
# gdp cpi_index rpi_index
# 1 NA NA NA
# 2 0.006427064 0.0072281257 0.009296686
# 3 0.007166400 0.0030245056 0.004805767
# 4 0.004061822 0.0061020008 0.006377043
# 5 0.006772674 0.0009282349 0.005544554
# 6 0.007517419 0.0070574736 0.008270973
# 7 0.006551699 0.0004058093 0.003515625
# 8 0.005457465 0.0009439040 0.001946283
Couldn't resist...
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
# Compute the growth rates, put the date column back in front, reshape to one
# row per (date, index) pair, and draw one line per index on a percent scale.
# across() and pivot_longer() replace the deprecated mutate_each()/funs()
# and the superseded gather().
testdata %>%
  select(-date) %>%
  mutate(across(everything(), ~ .x / lag(.x) - 1)) %>%
  bind_cols(testdata[1], .) %>%
  pivot_longer(-date, names_to = "index", values_to = "value") %>%
  ggplot(., aes(x = date, y = value,
                color = factor(index),
                group = factor(index))) +
  geom_line() +
  scale_y_continuous(labels = percent)
You can calculate it from the differences of the logged values.
# Growth via log differences: exp(diff(log(x))) - 1 equals x_t / x_{t-1} - 1.
# The first row has no predecessor and is filled with 0 here.
log_growth <- function(x) c(0, exp(diff(log(x))) - 1)
cbind(testdata[1], sapply(testdata[-1], log_growth))
date gdp cpi_index rpi_index
21 2013 Q1 0.000000000 0.0000000000 0.000000000
22 2013 Q2 0.006427064 0.0072281257 0.009296686
23 2013 Q3 0.007166400 0.0030245056 0.004805767
24 2013 Q4 0.004061822 0.0061020008 0.006377043
25 2014 Q1 0.006772674 0.0009282349 0.005544554
26 2014 Q2 0.007517419 0.0070574736 0.008270973
27 2014 Q3 0.006551699 0.0004058093 0.003515625
28 2014 Q4 0.005457465 0.0009439040 0.001946283
A data.table solution that adds the growth columns directly to the dataset via a loop, using a new column name created in the loop (column_growth).
list.of.columns = names of the columns for which you'd like growth rates.
Remove , by=group_ID if you don't want to calculate the rates by a group.
library(data.table)
# For every requested column, add "<column>_growth" to df by reference,
# holding the change relative to the previous row within each group_ID.
for (col in list.of.columns) {
  growth.col.name <- paste0(col, "_growth")
  # Wrapping the name in parentheses makes := use the string it holds as the
  # new column's name.
  df[, (growth.col.name) := get(col) / shift(get(col)) - 1, by = group_ID]
}

Resources