I have a multiple svytable in list and from that list I want to make them separate dataframes by saving the same data structure.
For example:
library(survey)
data(api)
x <- apiclus1
dclus1 <- svydesign(id=~dnum, weights=~pw, data=x, fpc=~fpc)
n <- c("sch.wide", "cname")
for(k in seq_along(n)){
assign((paste0( n[[k]], "_1")),((svytable(as.formula(paste0("~", n[[k]], "+stype")), design = dclus1, na.action=na.pass))))
}
list<- list(sch.wide_1, cname_1)
result <- lapply(list, function(x) ((prop.table(x, margin =2)*100)))
How to make the separate data frames from result list tables?
Edit: simplified approach modifying your for loop and with the use of janitor package
for(k in seq_along(n)) {
assign((paste0(n[[k]], "_1")), ((
svytable(
as.formula(paste0("~", n[[k]], "+stype")),
design = dclus1,
na.action = na.pass
) %>% as.data.frame() %>%
pivot_wider(names_from = stype, values_from = Freq) %>%
adorn_percentages("col") %>% adorn_pct_formatting()
)))
}
now you got:
> sch.wide_1
sch.wide E H M
No 8.3% 21.4% 32.0%
Yes 91.7% 78.6% 68.0%
> cname_1
cname E H M
Alameda 5.6% 7.1% 8.0%
Fresno 1.4% 7.1% 4.0%
Kern 0.7% 0.0% 4.0%
Los Angeles 8.3% 0.0% 12.0%
Mendocino 1.4% 7.1% 4.0%
Merced 1.4% 7.1% 4.0%
Orange 9.0% 0.0% 12.0%
Plumas 2.8% 28.6% 4.0%
San Diego 34.7% 14.3% 12.0%
San Joaquin 20.8% 21.4% 16.0%
Santa Clara 13.9% 7.1% 20.0%
you can explore janitor package and modify pct formatting, total,... to get your desired output.
not sure if you're going to do it 1 by one or you need a loop for it: here's one way for getting them separately:
a <- data.frame(result[1]) %>%
pivot_wider(names_from = stype, values_from = Freq)
> a
# A tibble: 2 × 4
sch.wide E H M
<fct> <dbl> <dbl> <dbl>
1 No 8.33 21.4 32
2 Yes 91.7 78.6 68
b <- data.frame(result[2]) %>%
pivot_wider(names_from = stype, values_from = Freq)
b
# A tibble: 11 × 4
cname E H M
<fct> <dbl> <dbl> <dbl>
1 Alameda 5.56 7.14 8
2 Fresno 1.39 7.14 4
3 Kern 0.694 0 4
4 Los Angeles 8.33 0 12
5 Mendocino 1.39 7.14 4
6 Merced 1.39 7.14 4
7 Orange 9.03 0 12
8 Plumas 2.78 28.6 4
9 San Diego 34.7 14.3 12
10 San Joaquin 20.8 21.4 16
11 Santa Clara 13.9 7.14 20
want to make a loop for it?
for (ii in 1:length(result)) {
assign(
paste0("df_", ii),
as.data.frame(result[[ii]]) %>%
pivot_wider(names_from = stype, values_from = Freq)
)
}
now you have df_1 and df_2
> df_1
# A tibble: 2 × 4
sch.wide E H M
<fct> <dbl> <dbl> <dbl>
1 No 8.33 21.4 32
2 Yes 91.7 78.6 68
> df_2
# A tibble: 11 × 4
cname E H M
<fct> <dbl> <dbl> <dbl>
1 Alameda 5.56 7.14 8
2 Fresno 1.39 7.14 4
3 Kern 0.694 0 4
4 Los Angeles 8.33 0 12
5 Mendocino 1.39 7.14 4
6 Merced 1.39 7.14 4
7 Orange 9.03 0 12
8 Plumas 2.78 28.6 4
9 San Diego 34.7 14.3 12
10 San Joaquin 20.8 21.4 16
11 Santa Clara 13.9 7.14 20
there might be a shortcut for it but this is how I'm doing so far. good luck
Related
I'm trying to complete a data.frame with scaled scores.
First I have a set of scores that relate to a grade, and a universal score that has been calculated.
library(dplyr)
df <- tibble(grade = c("X", "E", "D", "C", "B", "A", "Max"),
score = c(0,17,25,33,41,48,60),
universal = c(0,22,44,65,87,108,108))
I expand the frame to include all integer values of score
df %>% complete(score = full_seq(score, period = 1)) %>%
fill(grade, .direction = "down")
I now want to complete the universal score that relates to each integer score based on the relative steps between the previously defined universal scores for each grade.
This is based on a conversion/scaling factor:
(universal boundary for grade above - universal boundary below)/(score boundary grade above - score boundary grade below)
For the grade U this would be (22-0)/(17-0) = 1.29. Each previous score is summed with this factor to find the corresponding next universal score.
So the first part of the result should look like this:
score grade universal
0 U 0
1 U 1.29
2 U 2.59
3 U 3.88
4 U 5.18
5 U 6.47
6 U 7.76
7 U 9.06
8 U 10.35
9 U 11.65
10 U 12.94
11 U 14.24
12 U 15.53
13 U 16.82
14 U 18.12
15 U 19.41
16 U 20.71
17 N 22.00
I'm trying to achieve this with Tidy principles and various combinations of group_by(), complete(), seq(), etc., but haven't been able to achieve it in a neat way. I think my problem is that my max value is outside the grouping variable.
Any help will be much appreciated.
Base R has the approx function to do this linear interpolation. You can use it in a tidyverse context like this:
df %>%
complete(score = full_seq(score, period = 1)) %>%
fill(grade, .direction = "down") %>%
mutate(universal = approx(x=score,y=universal,xout=score)$y)
# A tibble: 61 × 3
score grade universal
<dbl> <chr> <dbl>
1 0 X 0
2 1 X 1.29
3 2 X 2.59
4 3 X 3.88
5 4 X 5.18
6 5 X 6.47
7 6 X 7.76
8 7 X 9.06
9 8 X 10.4
10 9 X 11.6
df %>% mutate(
inc = c(diff(universal) / diff(score), NA)
) %>%
complete(score = full_seq(score, period = 1)) %>%
fill(grade, inc, .direction = "down") %>%
group_by(grade) %>%
mutate(universal = first(universal) + (row_number() - 1) * inc) %>%
ungroup() %>%
print(n = 30)
# # A tibble: 61 × 4
# score grade universal inc
# <dbl> <chr> <dbl> <dbl>
# 1 0 X 0 1.29
# 2 1 X 1.29 1.29
# 3 2 X 2.59 1.29
# 4 3 X 3.88 1.29
# 5 4 X 5.18 1.29
# 6 5 X 6.47 1.29
# 7 6 X 7.76 1.29
# 8 7 X 9.06 1.29
# 9 8 X 10.4 1.29
# 10 9 X 11.6 1.29
# 11 10 X 12.9 1.29
# 12 11 X 14.2 1.29
# 13 12 X 15.5 1.29
# 14 13 X 16.8 1.29
# 15 14 X 18.1 1.29
# 16 15 X 19.4 1.29
# 17 16 X 20.7 1.29
# 18 17 E 22 2.75
# 19 18 E 24.8 2.75
# 20 19 E 27.5 2.75
# 21 20 E 30.2 2.75
# 22 21 E 33 2.75
# 23 22 E 35.8 2.75
# 24 23 E 38.5 2.75
# 25 24 E 41.2 2.75
# 26 25 D 44 2.62
# 27 26 D 46.6 2.62
# 28 27 D 49.2 2.62
# 29 28 D 51.9 2.62
# 30 29 D 54.5 2.62
# # … with 31 more rows
# # ℹ Use `print(n = ...)` to see more rows
Tried to find out avg delay in arrival for the nycflight2013 data set
flights %>%
group_by(carrier) %>%
summarize(avg_dep_delay=mean(arr_delay))
but result showing NA except one
As said in the comments, you need to set na.rm = TRUE in your mean function. You can use the following code:
library(nycflights13)
library(tidyverse)
flights %>%
group_by(carrier) %>%
summarize(avg_dep_delay=mean(arr_delay, na.rm = TRUE))
Output:
# A tibble: 16 × 2
carrier avg_dep_delay
<chr> <dbl>
1 9E 7.38
2 AA 0.364
3 AS -9.93
4 B6 9.46
5 DL 1.64
6 EV 15.8
7 F9 21.9
8 FL 20.1
9 HA -6.92
10 MQ 10.8
11 OO 11.9
12 UA 3.56
13 US 2.13
14 VX 1.76
15 WN 9.65
16 YV 15.6
Given a dataframe, how can I collapse several rows together and rename them as "other"? The function forcats::fct_lump() seems relevant in essence, but it operates on data with repetition, whereas my case is different.
Example
Some reproducible toy data about states in the USA.
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(janitor, warn.conflicts = FALSE)
set.seed(2021)
create_weights <- function(x) ceiling(exp(-x/3)*1000)
df_uncounted <-
state.x77 %>%
as_tibble(rownames = "state") %>%
mutate(freq = sample(create_weights(1:n()), size = n())) %>%
uncount(freq)
df_counted <-
df_uncounted %>%
summarise(janitor::tabyl(state)) %>%
arrange(desc(percent))
df_counted
#> # A tibble: 50 x 3
#> state n percent
#> <chr> <dbl> <dbl>
#> 1 Oklahoma 717 0.280
#> 2 New York 514 0.201
#> 3 Indiana 368 0.144
#> 4 Maine 264 0.103
#> 5 Florida 189 0.0737
#> 6 Colorado 136 0.0531
#> 7 Alabama 97 0.0378
#> 8 Oregon 70 0.0273
#> 9 Minnesota 50 0.0195
#> 10 Tennessee 36 0.0140
#> # ... with 40 more rows
Created on 2021-08-11 by the reprex package (v2.0.0)
For the sake of this question, the dataset df_counted is given.
When I examine such data, I typically want to keep only rows that represent the larger chunks, and collapse the rest into "other". In this example, I may desire to collapse the data according to two scenarios.
Scenario A
Rows 1:4 are the ones I want to keep as-is, whereas rows 5:50 I'd collapse into "other".
Desired output for scenario A:
# A tibble: 5 x 3
state n percent
<chr> <dbl> <dbl>
1 Oklahoma 717 0.280
2 New York 514 0.201
3 Indiana 368 0.144
4 Maine 264 0.103
5 other 700 0.273
Scenario B
Any row that has a value lower than 0.01 in the percent column should be grouped as "other".
Desired output for scenario B
state n percent
<chr> <dbl> <dbl>
1 Oklahoma 717 0.280
2 New York 514 0.201
3 Indiana 368 0.144
4 Maine 264 0.103
5 Florida 189 0.0737
6 Colorado 136 0.0531
7 Alabama 97 0.0378
8 Oregon 70 0.0273
9 Minnesota 50 0.0195
10 Tennessee 36 0.0140
11 Montana 26 0.0101
12 other 96 0.0375
I suppose this is a pretty common procedure, but I didn't find a direct function that does so. My attempts to achieve the desired outputs included some very cumbersome code. way too complex for such a simple purpose.
Anyone knows of a straightforward way to achieve those desired outputs? Thanks!
It's just a case of creating appropriate groups and then summarising:
library(dplyr)
df_counted %>%
group_by(state = ifelse(row_number() < 5, state, "other")) %>%
summarise(across(everything(), sum)) %>%
arrange(state == "other", -n)
# A tibble: 5 x 3
state n percent
<chr> <dbl> <dbl>
1 Oklahoma 717 0.280
2 New York 514 0.201
3 Indiana 368 0.144
4 Maine 264 0.103
5 other 700 0.273
df_counted %>%
group_by(state = ifelse(percent >= .01, state, "other")) %>%
summarise(across(everything(), sum)) %>%
arrange(state == "other", -n)
# A tibble: 12 x 3
state n percent
<chr> <dbl> <dbl>
1 Oklahoma 717 0.280
2 New York 514 0.201
3 Indiana 368 0.144
4 Maine 264 0.103
5 Florida 189 0.0737
6 Colorado 136 0.0531
7 Alabama 97 0.0378
8 Oregon 70 0.0273
9 Minnesota 50 0.0195
10 Tennessee 36 0.0140
11 Montana 26 0.0101
12 other 96 0.0375
In case you have curiosity of knowing how to do it with forcats:
# scenario A
df_counted %>%
group_by(state = fct_lump_n(state, 4, w = percent)) %>%
summarise(across(.fn = sum)) %>%
arrange(state == "Other", -n)
# scenario B
df_counted %>%
group_by(state = fct_lump_min(state, .01, w = percent)) %>%
summarise(across(.fn = sum)) %>%
arrange(state == "Other",-n)
Base R options using aggregate
aggregate(
. ~ state,
transform(
df_counted,
state = replace(state, seq_along(state) >= 5, "other")
),
sum
)
which gives
state n percent
1 Indiana 368 0.1435817
2 Maine 264 0.1030043
3 New York 514 0.2005462
4 Oklahoma 717 0.2797503
5 other 700 0.2731174
aggregate(
. ~ state,
transform(
df_counted,
state = replace(state, percent < 0.01, "other")
),
sum
)
gives
state n percent
1 Alabama 97 0.03784627
2 Colorado 136 0.05306282
3 Florida 189 0.07374171
4 Indiana 368 0.14358174
5 Maine 264 0.10300429
6 Minnesota 50 0.01950839
7 Montana 26 0.01014436
8 New York 514 0.20054623
9 Oklahoma 717 0.27975029
10 Oregon 70 0.02731174
11 other 96 0.03745611
12 Tennessee 36 0.01404604
I have a dataset below in which I want to do linear regression for each country and state and then cbind the predicted values in the dataset:
Final data frame after adding three more columns:
I have done it for one country and one area but want to do it for each country and area and put the predicted, upper and lower limit values back in the data set by cbind:
data <- data.frame(country = c("US","US","US","US","US","US","US","US","US","US","UK","UK","UK","UK","UK"),
Area = c("G","G","G","G","G","I","I","I","I","I","A","A","A","A","A"),
week = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5),amount = c(12,23,34,32,12,12,34,45,65,45,45,34,23,43,43))
data_1 <- data[(data$country=="US" & data$Area=="G"),]
model <- lm(amount ~ week, data = data_1)
pre <- predict(model,newdata = data_1,interval = "prediction",level = 0.95)
pre
How can I loop this for other combination of country and Area?
...and a Base R solution:
data <- data.frame(country = c("US","US","US","US","US","US","US","US","US","US","UK","UK","UK","UK","UK"),
Area = c("G","G","G","G","G","I","I","I","I","I","A","A","A","A","A"),
week = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5),amount = c(12,23,34,32,12,12,34,45,65,45,45,34,23,43,43))
splitVar <- paste0(data$country,"-",data$Area)
dfList <- split(data,splitVar)
result <- do.call(rbind,lapply(dfList,function(x){
model <- lm(amount ~ week, data = x)
cbind(x,predict(model,newdata = x,interval = "prediction",level = 0.95))
}))
result
...the results:
country Area week amount fit lwr upr
UK-A.11 UK A 1 45 36.6 -6.0463638 79.24636
UK-A.12 UK A 2 34 37.1 -1.3409128 75.54091
UK-A.13 UK A 3 23 37.6 0.6671656 74.53283
UK-A.14 UK A 4 43 38.1 -0.3409128 76.54091
UK-A.15 UK A 5 43 38.6 -4.0463638 81.24636
US-G.1 US G 1 12 20.8 -27.6791493 69.27915
US-G.2 US G 2 23 21.7 -21.9985147 65.39851
US-G.3 US G 3 34 22.6 -19.3841749 64.58417
US-G.4 US G 4 32 23.5 -20.1985147 67.19851
US-G.5 US G 5 12 24.4 -24.0791493 72.87915
US-I.6 US I 1 12 20.8 -33.8985900 75.49859
US-I.7 US I 2 34 30.5 -18.8046427 79.80464
US-I.8 US I 3 45 40.2 -7.1703685 87.57037
US-I.9 US I 4 65 49.9 0.5953573 99.20464
US-I.10 US I 5 45 59.6 4.9014100 114.29859
We can also use function augment from package broom to get your desired information:
library(purrr)
library(broom)
data %>%
group_by(country, Area) %>%
nest() %>%
mutate(models = map(data, ~ lm(amount ~ week, data = .)),
aug = map(models, ~ augment(.x, interval = "prediction"))) %>%
unnest(aug) %>%
select(country, Area, amount, week, .fitted, .lower, .upper)
# A tibble: 15 x 7
# Groups: country, Area [3]
country Area amount week .fitted .lower .upper
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 US G 12 1 20.8 -27.7 69.3
2 US G 23 2 21.7 -22.0 65.4
3 US G 34 3 22.6 -19.4 64.6
4 US G 32 4 23.5 -20.2 67.2
5 US G 12 5 24.4 -24.1 72.9
6 US I 12 1 20.8 -33.9 75.5
7 US I 34 2 30.5 -18.8 79.8
8 US I 45 3 40.2 -7.17 87.6
9 US I 65 4 49.9 0.595 99.2
10 US I 45 5 59.6 4.90 114.
11 UK A 45 1 36.6 -6.05 79.2
12 UK A 34 2 37.1 -1.34 75.5
13 UK A 23 3 37.6 0.667 74.5
14 UK A 43 4 38.1 -0.341 76.5
15 UK A 43 5 38.6 -4.05 81.2
Here is a tidyverse way to do this for every combination of country and Area.
library(tidyverse)
data %>%
group_by(country, Area) %>%
nest() %>%
mutate(model = map(data, ~ lm(amount ~ week, data = .x)),
result = map2(model, data, ~data.frame(predict(.x, newdata = .y,
interval = "prediction",level = 0.95)))) %>%
ungroup %>%
select(-model) %>%
unnest(c(data, result))
# country Area week amount fit lwr upr
# <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 US G 1 12 20.8 -27.7 69.3
# 2 US G 2 23 21.7 -22.0 65.4
# 3 US G 3 34 22.6 -19.4 64.6
# 4 US G 4 32 23.5 -20.2 67.2
# 5 US G 5 12 24.4 -24.1 72.9
# 6 US I 1 12 20.8 -33.9 75.5
# 7 US I 2 34 30.5 -18.8 79.8
# 8 US I 3 45 40.2 -7.17 87.6
# 9 US I 4 65 49.9 0.595 99.2
#10 US I 5 45 59.6 4.90 114.
#11 UK A 1 45 36.6 -6.05 79.2
#12 UK A 2 34 37.1 -1.34 75.5
#13 UK A 3 23 37.6 0.667 74.5
#14 UK A 4 43 38.1 -0.341 76.5
#15 UK A 5 43 38.6 -4.05 81.2
And one more:
library(tidyverse)
data %>%
mutate(CountryArea=paste0(country,Area) %>% factor %>% fct_inorder) %>%
split(.$CountryArea) %>%
map(~lm(amount~week, data=.)) %>%
map(predict, interval = "prediction",level = 0.95) %>%
reduce(rbind) %>%
cbind(data, .)
country Area week amount fit lwr upr
1 US G 1 12 20.8 -27.6791493 69.27915
2 US G 2 23 21.7 -21.9985147 65.39851
3 US G 3 34 22.6 -19.3841749 64.58417
4 US G 4 32 23.5 -20.1985147 67.19851
5 US G 5 12 24.4 -24.0791493 72.87915
6 US I 1 12 20.8 -33.8985900 75.49859
7 US I 2 34 30.5 -18.8046427 79.80464
8 US I 3 45 40.2 -7.1703685 87.57037
9 US I 4 65 49.9 0.5953573 99.20464
10 US I 5 45 59.6 4.9014100 114.29859
11 UK A 1 45 36.6 -6.0463638 79.24636
12 UK A 2 34 37.1 -1.3409128 75.54091
13 UK A 3 23 37.6 0.6671656 74.53283
14 UK A 4 43 38.1 -0.3409128 76.54091
15 UK A 5 43 38.6 -4.0463638 81.24636
I often need to rescale time series relative to their value at a certain baseline time (usually as a percent of the baseline). Here's an example.
> library(dplyr)
> library(magrittr)
> library(tibble)
> library(tidyr)
# [messages from package imports snipped]
> set.seed(42)
> mexico <- tibble(Year=2000:2004, Country='Mexico', A=10:14+rnorm(5), B=20:24+rnorm(5))
> usa <- tibble(Year=2000:2004, Country='USA', A=30:34+rnorm(5), B=40:44+rnorm(5))
> table <- rbind(mexico, usa)
> table
# A tibble: 10 x 4
Year Country A B
<int> <chr> <dbl> <dbl>
1 2000 Mexico 11.4 19.9
2 2001 Mexico 10.4 22.5
3 2002 Mexico 12.4 21.9
4 2003 Mexico 13.6 25.0
5 2004 Mexico 14.4 23.9
6 2000 USA 31.3 40.6
7 2001 USA 33.3 40.7
8 2002 USA 30.6 39.3
9 2003 USA 32.7 40.6
10 2004 USA 33.9 45.3
I want to scale A and B to express each value as a percent of the country-specific 2001 value (i.e., the A and B entries in rows 2 and 7 should be 100). My way of doing this is somewhat roundabout and awkward: extract the baseline values into a separate table, merge them back into a separate column in the main table, and then compute scaled values, with annoying intermediate gathering and spreading to avoid specifying the column names of each time series (real data sets can have far more than two value columns). Is there a better way to do this, ideally with a single short pipeline?
> long_table <- table %>% gather(variable, value, -Year, -Country)
> long_table
# A tibble: 20 x 4
Year Country variable value
<int> <chr> <chr> <dbl>
1 2000 Mexico A 11.4
2 2001 Mexico A 10.4
#[remaining tibble printout snipped]
> baseline_table <- long_table %>%
filter(Year == 2001) %>%
select(-Year) %>%
rename(baseline=value)
> baseline_table
# A tibble: 4 x 3
Country variable baseline
<chr> <chr> <dbl>
1 Mexico A 10.4
2 USA A 33.3
3 Mexico B 22.5
4 USA B 40.7
> normalized_table <- long_table %>%
inner_join(baseline_table) %>%
mutate(value=100*value/baseline) %>%
select(-baseline) %>%
spread(variable, value) %>%
arrange(Country, Year)
Joining, by = c("Country", "variable")
> normalized_table
# A tibble: 10 x 4
Year Country A B
<int> <chr> <dbl> <dbl>
1 2000 Mexico 109. 88.4
2 2001 Mexico 100. 100
3 2002 Mexico 118. 97.3
4 2003 Mexico 131. 111.
5 2004 Mexico 138. 106.
6 2000 USA 94.0 99.8
7 2001 USA 100 100
8 2002 USA 92.0 96.6
9 2003 USA 98.3 99.6
10 2004 USA 102. 111.
My second attempt was to use transform, but this failed because transform doesn't seem to recognize dplyr groups, and it would be suboptimal even if it worked because it requires me to know that 2001 is the second year in the time series.
> table %>%
arrange(Country, Year) %>%
gather(variable, value, -Year, -Country) %>%
group_by(Country, variable) %>%
transform(norm=value*100/value[2])
Year Country variable value norm
1 2000 Mexico A 11.37096 108.9663
2 2001 Mexico A 10.43530 100.0000
3 2002 Mexico A 12.36313 118.4741
4 2003 Mexico A 13.63286 130.6418
5 2004 Mexico A 14.40427 138.0340
6 2000 USA A 31.30487 299.9901
7 2001 USA A 33.28665 318.9811
8 2002 USA A 30.61114 293.3422
9 2003 USA A 32.72121 313.5627
10 2004 USA A 33.86668 324.5395
11 2000 Mexico B 19.89388 190.6402
12 2001 Mexico B 22.51152 215.7247
13 2002 Mexico B 21.90534 209.9157
14 2003 Mexico B 25.01842 239.7480
15 2004 Mexico B 23.93729 229.3876
16 2000 USA B 40.63595 389.4085
17 2001 USA B 40.71575 390.1732
18 2002 USA B 39.34354 377.0235
19 2003 USA B 40.55953 388.6762
20 2004 USA B 45.32011 434.2961
It would be nice for this to be more scalable, but here's a simple solution. You can refer to A[Year == 2001] inside mutate, much as you might do table$A[table$Year == 2001] in base R. This lets you scale against your baseline of 2001 or whatever other year you might need.
Edit: I was missing a group_by to ensure that values are only being scaled against other values in their own group. The "sanity check" (that I clearly didn't do) is that values for Mexico in 2001 should have a scaled value of 1, and same for USA and any other countries.
library(tidyverse)
set.seed(42)
mexico <- tibble(Year=2000:2004, Country='Mexico', A=10:14+rnorm(5), B=20:24+rnorm(5))
usa <- tibble(Year=2000:2004, Country='USA', A=30:34+rnorm(5), B=40:44+rnorm(5))
table <- rbind(mexico, usa)
table %>%
group_by(Country) %>%
mutate(A_base2001 = A / A[Year == 2001], B_base2001 = B / B[Year == 2001])
#> # A tibble: 10 x 6
#> # Groups: Country [2]
#> Year Country A B A_base2001 B_base2001
#> <int> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 2000 Mexico 11.4 19.9 1.09 0.884
#> 2 2001 Mexico 10.4 22.5 1 1
#> 3 2002 Mexico 12.4 21.9 1.18 0.973
#> 4 2003 Mexico 13.6 25.0 1.31 1.11
#> 5 2004 Mexico 14.4 23.9 1.38 1.06
#> 6 2000 USA 31.3 40.6 0.940 0.998
#> 7 2001 USA 33.3 40.7 1 1
#> 8 2002 USA 30.6 39.3 0.920 0.966
#> 9 2003 USA 32.7 40.6 0.983 0.996
#> 10 2004 USA 33.9 45.3 1.02 1.11
Created on 2018-05-23 by the reprex package (v0.2.0).
Inspired by Camille's answer, I found one simple approach that that scales well:
table %>%
gather(variable, value, -Year, -Country) %>%
group_by(Country, variable) %>%
mutate(value=100*value/value[Year == 2001]) %>%
spread(variable, value)
# A tibble: 10 x 4
# Groups: Country [2]
Year Country A B
<int> <chr> <dbl> <dbl>
1 2000 Mexico 109. 88.4
2 2000 USA 94.0 99.8
3 2001 Mexico 100. 100
4 2001 USA 100 100
5 2002 Mexico 118. 97.3
6 2002 USA 92.0 96.6
7 2003 Mexico 131. 111.
8 2003 USA 98.3 99.6
9 2004 Mexico 138. 106.
10 2004 USA 102. 111.
Preserving the the original values alongside the scaled ones takes more work. Here are two approaches. One of them uses an extra gather call to produce two variable-name columns (one indicating the series name, the other marking original or scaled), then unifying them into one column and reformatting.
table %>%
gather(variable, original, -Year, -Country) %>%
group_by(Country, variable) %>%
mutate(scaled=100*original/original[Year == 2001]) %>%
gather(scaled, value, -Year, -Country, -variable) %>%
unite(variable_scaled, variable, scaled, sep='_') %>%
mutate(variable_scaled=gsub("_original", "", variable_scaled)) %>%
spread(variable_scaled, value)
# A tibble: 10 x 6
# Groups: Country [2]
Year Country A A_scaled B B_scaled
<int> <chr> <dbl> <dbl> <dbl> <dbl>
1 2000 Mexico 11.4 109. 19.9 88.4
2 2000 USA 31.3 94.0 40.6 99.8
3 2001 Mexico 10.4 100. 22.5 100
4 2001 USA 33.3 100 40.7 100
5 2002 Mexico 12.4 118. 21.9 97.3
6 2002 USA 30.6 92.0 39.3 96.6
7 2003 Mexico 13.6 131. 25.0 111.
8 2003 USA 32.7 98.3 40.6 99.6
9 2004 Mexico 14.4 138. 23.9 106.
10 2004 USA 33.9 102. 45.3 111.
A second equivalent approach creates a new table with the columns scaled "in place" and then merges it back into with the original one.
table %>%
gather(variable, value, -Year, -Country) %>%
group_by(Country, variable) %>%
mutate(value=100*value/value[Year == 2001]) %>%
ungroup() %>%
mutate(variable=paste(variable, 'scaled', sep='_')) %>%
spread(variable, value) %>%
inner_join(table)
Joining, by = c("Year", "Country")
# A tibble: 10 x 6
Year Country A_scaled B_scaled A B
<int> <chr> <dbl> <dbl> <dbl> <dbl>
1 2000 Mexico 109. 88.4 11.4 19.9
2 2000 USA 94.0 99.8 31.3 40.6
3 2001 Mexico 100. 100 10.4 22.5
4 2001 USA 100 100 33.3 40.7
5 2002 Mexico 118. 97.3 12.4 21.9
6 2002 USA 92.0 96.6 30.6 39.3
7 2003 Mexico 131. 111. 13.6 25.0
8 2003 USA 98.3 99.6 32.7 40.6
9 2004 Mexico 138. 106. 14.4 23.9
10 2004 USA 102. 111. 33.9 45.3
It's possible to replace the final inner_join with arrange(County, Year) %>% select(-Country, -Year) %>% bind_cols(table), which may perform better for some data sets, though it orders the columns suboptimally.