Min() ignoring zeros and NA with dplyr - r

I have a df that looks like this:
group year
1 2020
1 NA
1 0
2 2021
2 2006
3 NA
3 0
3 2010
3 2010
4 2006
4 2005
4 2010
And I want to group by group and then find the minimum year while ignoring NAs and 0 entries:
group year minYr
1 2020 2020
1 NA 2020
1 0 2020
2 2021 2006
2 2006 2006
3 NA 2010
3 0 2010
3 2010 2010
3 2010 2010
4 2006 2005
4 2005 2005
4 2010 2005
My initial approach
df <- df %>% group_by(group) %>% mutate (minYr = min(year, na.rm = TRUE)) caused a runtime error and didn't take care of the zeros.
Does anyone have a better way of doing this?

# Per-group minimum of `year`, excluding zeros by subsetting with
# `year[year > 0]`. Because `NA > 0` evaluates to NA (which keeps the NA
# elements in the subset), na.rm = TRUE is still required to drop the NAs.
df1 %>%
group_by(group) %>%
mutate(minYr = min(year[year > 0], na.rm = TRUE)) %>%
# mutate(minYr = min(year[year > 0 & !is.na(year)])) %>% # equivalent
ungroup()
# A tibble: 12 × 3
group year minYr
<dbl> <dbl> <dbl>
1 1 2020 2020
2 1 NA 2020
3 1 0 2020
4 2 2021 2006
5 2 2006 2006
6 3 NA 2010
7 3 0 2010
8 3 2010 2010
9 3 2010 2010
10 4 2006 2005
11 4 2005 2005
12 4 2010 2005
# Reproducible example data: 12 rows across 4 groups, with NA and 0
# entries in `year` that must be ignored when computing the group minimum.
df1 <- structure(list(group = c(1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4),
year = c(2020, NA, 0, 2021, 2006, NA, 0, 2010, 2010, 2006, 2005, 2010)),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -12L))

Related

How to create a column that is dependent on the average of previous observed events?

In the data below we observe a virtual GDP growth of a certain country over time. My aim is to create a variable with three categories: 0 = no crisis, 1 = crisis, 2 = severe crisis. That is, to identify
economic crises as years where the growth rate falls at least one (crisis) or two (severe) standard deviations below the average of the previous 3-year growth trend.
Could someone give some guidance please?
growth year
5 1990
4 1991
0 1992
-4 1993
-3 1994
-1 1995
2 1996
4 1997
7 1998
10 1999
8 2000
-10 2001
-8 2002
2 2003
4 2004
5 2005
8 2006
4 2007
-10 2008
-9 2009
-8 2010
-3 2011
0 2012
-5 2013
-6 2014
-2 2015
4 2016
5 2017
5 2018
8 2019
2 2020
-1 2021
-1 2022
Here is the data:
# Question data: yearly GDP growth for 1990-2022 (33 observations).
df=structure(list(gdp_growth = c(5, 4, 0, -4, -3, -1, 2, 4, 7, 10,
8, -10, -8, 2, 4, 5, 8, 4, -10, -9, -8, -3, 0, -5, -6, -2, 4,
5, 5, 8, 2, -1, -1), year = c(1990, 1991, 1992, 1993, 1994, 1995,
1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017,
2018, 2019, 2020, 2021, 2022)), row.names = c(NA, -33L), class = "data.frame")
From your description, it sounds as though you first need to calculate the rolling mean of growth, then compare the current year's growth to this:
library(dplyr)
# Rolling 3-year mean of growth; align = 'right' means the window covers
# the current year plus the two preceding years (not strictly the
# "previous 3 years" the question describes — TODO confirm intent).
# NOTE(review): sd(gdp_growth) is the *global* standard deviation of the
# whole series, not a rolling one — verify this matches the definition.
df %>%
mutate(mn = zoo::rollmean(gdp_growth, 3, na.pad = TRUE, align = 'right'),
crisis = ifelse(gdp_growth < (mn - sd(gdp_growth)),
ifelse(gdp_growth < (mn - 2 * sd(gdp_growth)),
2, 1), 0)) %>%
select(-mn)
#> gdp_growth year crisis
#> 1 5 1990 NA
#> 2 4 1991 NA
#> 3 0 1992 0
#> 4 -4 1993 0
#> 5 -3 1994 0
#> 6 -1 1995 0
#> 7 2 1996 0
#> 8 4 1997 0
#> 9 7 1998 0
#> 10 10 1999 0
#> 11 8 2000 0
#> 12 -10 2001 2
#> 13 -8 2002 0
#> 14 2 2003 0
#> 15 4 2004 0
#> 16 5 2005 0
#> 17 8 2006 0
#> 18 4 2007 0
#> 19 -10 2008 1
#> 20 -9 2009 0
#> 21 -8 2010 0
#> 22 -3 2011 0
#> 23 0 2012 0
#> 24 -5 2013 0
#> 25 -6 2014 0
#> 26 -2 2015 0
#> 27 4 2016 0
#> 28 5 2017 0
#> 29 5 2018 0
#> 30 8 2019 0
#> 31 2 2020 0
#> 32 -1 2021 0
#> 33 -1 2022 0
Here's another example, this time using RcppRoll package which has loads of fast rolling functions compatible with dplyr.
library(dplyr)
# NOTE(review): this classifies each year by the magnitude of the rolling
# 3-year standard deviation itself (i.e. volatility), which differs from
# the question's "growth below rolling mean minus 1 or 2 sd" definition —
# confirm which behavior is actually wanted before reusing.
df %>%
mutate(
std3 = RcppRoll::roll_sd(gdp_growth , 3, fill=0, align = "right"),
crisis = case_when(
std3 < 1 ~ 'no crisis',
std3 < 2 ~ 'crisis',
T ~ 'severe crisis'
)
)
#> gdp_growth year std3 crisis
#> 1 5 1990 0.0000000 no crisis
#> 2 4 1991 0.0000000 no crisis
#> 3 0 1992 2.6457513 severe crisis
#> 4 -4 1993 4.0000000 severe crisis
#> 5 -3 1994 2.0816660 severe crisis
#> 6 -1 1995 1.5275252 crisis
#> 7 2 1996 2.5166115 severe crisis
#> 8 4 1997 2.5166115 severe crisis
#> 9 7 1998 2.5166115 severe crisis
#> 10 10 1999 3.0000000 severe crisis
#> 11 8 2000 1.5275252 crisis
#> 12 -10 2001 11.0151411 severe crisis
#> 13 -8 2002 9.8657657 severe crisis
#> 14 2 2003 6.4291005 severe crisis
#> 15 4 2004 6.4291005 severe crisis
#> 16 5 2005 1.5275252 crisis
#> 17 8 2006 2.0816660 severe crisis
#> 18 4 2007 2.0816660 severe crisis
#> 19 -10 2008 9.4516313 severe crisis
#> 20 -9 2009 7.8102497 severe crisis
#> 21 -8 2010 1.0000000 crisis
#> 22 -3 2011 3.2145503 severe crisis
#> 23 0 2012 4.0414519 severe crisis
#> 24 -5 2013 2.5166115 severe crisis
#> 25 -6 2014 3.2145503 severe crisis
#> 26 -2 2015 2.0816660 severe crisis
#> 27 4 2016 5.0332230 severe crisis
#> 28 5 2017 3.7859389 severe crisis
#> 29 5 2018 0.5773503 no crisis
#> 30 8 2019 1.7320508 crisis
#> 31 2 2020 3.0000000 severe crisis
#> 32 -1 2021 4.5825757 severe crisis
#> 33 -1 2022 1.7320508 crisis
Created on 2022-07-11 by the reprex package (v2.0.1)
You could use lag, rowwise*, and mutate within dplyr:
library(dplyr)
# Build the three *previous* years of growth as explicit lag columns, then
# use rowwise() + c_across() to compute their per-row mean and sd, and
# classify the current year's growth against mean - 1 sd (crisis = 1) and
# mean - 2 sd (severe = 2). The first three rows have incomplete lags, so
# their mean is NA and crisis is set to NA. Helper columns are dropped.
df |>
mutate(gdp3_growth_lag1 = lag(gdp_growth, 1),
gdp3_growth_lag2 = lag(gdp_growth, 2),
gdp3_growth_lag3 = lag(gdp_growth, 3)) |>
rowwise() |>
mutate(
gdp3_growth_mean = mean(c_across(starts_with("gdp3_growth_lag"))),
gdp3_growth_sd = sd(c_across(starts_with("gdp3_growth_lag")))
) |>
ungroup() |>
mutate(crisis = case_when(gdp_growth <= gdp3_growth_mean - 2 * gdp3_growth_sd ~ 2,
gdp_growth <= gdp3_growth_mean - gdp3_growth_sd ~ 1,
is.na(gdp3_growth_mean) ~ NA_real_,
TRUE ~ 0)) |>
select(-starts_with("gdp3"))
Output:
# A tibble: 33 × 3
gdp_growth year crisis
<dbl> <dbl> <dbl>
1 5 1990 NA
2 4 1991 NA
3 0 1992 NA
4 -4 1993 2
5 -3 1994 0
6 -1 1995 0
7 2 1996 0
8 4 1997 0
9 7 1998 0
10 10 1999 0
11 8 2000 0
12 -10 2001 2
13 -8 2002 0
14 2 2003 0
15 4 2004 0
16 5 2005 0
17 8 2006 0
18 4 2007 0
19 -10 2008 2
20 -9 2009 1
21 -8 2010 0
22 -3 2011 0
23 0 2012 0
24 -5 2013 0
25 -6 2014 1
26 -2 2015 0
27 4 2016 0
28 5 2017 0
29 5 2018 0
30 8 2019 0
31 2 2020 2
32 -1 2021 2
33 -1 2022 0
Updated with full output.
(*) There is also rowSds in matrixStats

How to replace missing values only for people who have positive values on the first observation in panel data?

I have four people who are followed for four years. I would like to replace the NA by 0, but only for people who have a positive value in workhours on the first wave they were interviewed. For example, in my data, this means that the persons with ID 3 and 4 will have their data replaced by 0, but the person with ID 2 will keep his/her NA.
id wave year work_hours
1 1 2007 40
1 2 2008 39
1 3 2009 39
1 4 2010 38
2 1 2005 NA
2 2 2006 35
2 3 2007 35
2 4 2008 NA
3 1 2007 40
3 2 2008 NA
3 3 2009 40
3 4 2010 40
4 1 2009 32
4 2 2010 NA
4 3 2011 32
4 4 2012 NA
I tried the following code, but it is replacing the first wave with 0 but not the waves that follows:
df= df %>% group_by(id) %>%
mutate(workhours_imputed= ifelse(work_hours>0 & wave==1, replace_na(0), work_hours))
Here Is the Data:
# Question data (dput): note it is already a grouped_df (grouped by `id`),
# and `workhours_imputed` here shows the asker's incorrect attempt (wave-1
# values overwritten with 0) — presumably to be recomputed; verify.
structure(list(id = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4,
4, 4, 4), wave = c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
3, 4), year = c(2007, 2008, 2009, 2010, 2005, 2006, 2007, 2008,
2007, 2008, 2009, 2010, 2009, 2010, 2011, 2012), work_hours = c(40,
39, 39, 38, NA, 35, 35, NA, 40, NA, 40, 40, 32, NA, 32, NA),
workhours_imputed = c(0, 39, 39, 38, NA, 35, 35, NA, 0, NA,
40, 40, 0, NA, 32, NA)), row.names = c(NA, -16L), groups = structure(list(
id = c(1, 2, 3, 4), .rows = structure(list(1:4, 5:8, 9:12,
13:16), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Alternative dplyr solution:
# Replace NA work_hours with 0 only when the person's wave-1 value exists.
# NOTE(review): this relies on `df` already being a grouped_df by `id`
# (as in the question's dput) so that any(...) is evaluated per person;
# with an ungrouped data frame, add group_by(id) first.
df %>%
mutate(workhours_imputed = if_else(
is.na(work_hours) & any(wave == 1 & !is.na(work_hours)),
0, work_hours)
)
# # A tibble: 16 x 5
# # Groups: id [4]
# id wave year work_hours workhours_imputed
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1 2007 40 40
# 2 1 2 2008 39 39
# 3 1 3 2009 39 39
# 4 1 4 2010 38 38
# 5 2 1 2005 NA NA
# 6 2 2 2006 35 35
# 7 2 3 2007 35 35
# 8 2 4 2008 NA NA
# 9 3 1 2007 40 40
# 10 3 2 2008 NA 0
# 11 3 3 2009 40 40
# 12 3 4 2010 40 40
# 13 4 1 2009 32 32
# 14 4 2 2010 NA 0
# 15 4 3 2011 32 32
# 16 4 4 2012 NA 0
If wave does not always start at 1 but you always want to check the first value of wave, then you can use this instead:
# Variant for when wave numbering does not always start at 1: check the
# work_hours value on each person's earliest wave via which.min(wave).
# (Same caveat as above: assumes the data frame is grouped by id.)
df %>%
mutate(workhours_imputed = if_else(
is.na(work_hours) & !is.na(work_hours[which.min(wave)]),
0, work_hours)
)
One way to do this using match -
library(dplyr)
df %>%
group_by(id) %>%
mutate(workhours_imputed = {
# work_hours on wave 1 for this id (NA if wave 1 itself is NA)
tmp <- work_hours[match(1, wave)]
# If the 1st wave has a positive value,
# replace every NA in this person's series with 0
if(!is.na(tmp) && tmp > 0) replace(work_hours, is.na(work_hours), 0) else work_hours
})
# id wave year work_hours workhours_imputed
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1 2007 40 40
# 2 1 2 2008 39 39
# 3 1 3 2009 39 39
# 4 1 4 2010 38 38
# 5 2 1 2005 NA NA
# 6 2 2 2006 35 35
# 7 2 3 2007 35 35
# 8 2 4 2008 NA NA
# 9 3 1 2007 40 40
#10 3 2 2008 NA 0
#11 3 3 2009 40 40
#12 3 4 2010 40 40
#13 4 1 2009 32 32
#14 4 2 2010 NA 0
#15 4 3 2011 32 32
#16 4 4 2012 NA 0

Remove duplicate year rows by groups [duplicate]

This question already has answers here:
get rows of unique values by group
(4 answers)
Closed 1 year ago.
I have a data.table of the following form:-
# Example data: group 2 repeats years 2011 and 2012; group 3 repeats 2012.
data <- data.table(group = rep(1:3, each = 4),
year = c(2011:2014, rep(2011:2012, each = 2),
2012, 2012, 2013, 2014), value = 1:12)
This is only an abstract of my data.
So group 2 has 2 values for 2011 and 2012. And group 3 has 2 values for the year 2012. I want to just keep the first row for all the duplicated years.
So, in effect, my data.table will become the following:-
data <- data.table(group = c(rep(1, 4), rep(2, 2), rep(3, 3)),
year = c(2011:2014, 2011, 2012, 2012, 2013, 2014),
value = c(1:5, 7, 9, 11, 12))
How can I achieve this? Thanks in advance.
Try this data.table option with duplicated
> data[!duplicated(cbind(group, year))]
group year value
1: 1 2011 1
2: 1 2012 2
3: 1 2013 3
4: 1 2014 4
5: 2 2011 5
6: 2 2012 7
7: 3 2012 9
8: 3 2013 11
9: 3 2014 12
For data.tables you can pass by argument to unique -
library(data.table)
# unique() on a data.table accepts `by` to choose which columns define a
# duplicate; the first row of each (group, year) pair is kept.
unique(data, by = c('group', 'year'))
# group year value
#1: 1 2011 1
#2: 1 2012 2
#3: 1 2013 3
#4: 1 2014 4
#5: 2 2011 5
#6: 2 2012 7
#7: 3 2012 9
#8: 3 2013 11
#9: 3 2014 12
Using base R
subset(data, !duplicated(cbind(group, year)))
One solution would be to use distinct from dplyr like so:
library(dplyr)
# distinct() keeps the first occurrence of each (group, year) pair;
# .keep_all = TRUE retains the remaining columns (here, `value`).
data %>%
distinct(group, year, .keep_all = TRUE)
Output:
group year value
1: 1 2011 1
2: 1 2012 2
3: 1 2013 3
4: 1 2014 4
5: 2 2011 5
6: 2 2012 7
7: 3 2012 9
8: 3 2013 11
9: 3 2014 12
This should do the trick:
library(tidyverse)
# Keep the first row of each (group, year) combination.
# Fix: the original used filter(!duplicated(group, year)), which silently
# passes `year` as duplicated()'s `incomparables` argument — it produced
# the right answer here only because no `group` value happened to occur in
# `year`. row_number() == 1 states the intent directly and is always
# correct within the grouped context.
data %>%
  group_by(group, year) %>%
  filter(row_number() == 1)

In R: How can I check that I have consecutive years of data (to later be able to calculate growth)?

I have the dataframe (sample) below:
companyID year yearID
1 2010 1
1 2011 2
1 2012 3
1 2013 4
2 2010 1
2 2011 2
2 2016 3
2 2017 4
2 2018 5
3 2010 1
3 2011 2
3 2014 3
3 2017 4
3 2018 5
I have used a for loop in order to try and create a sequence column that starts a new number for each new sequence of numbers. I am new to R so my definitions may be a bit wrong. My for loop looks like this:
size1 <- c(1:3)
s <- 0
for (val1 in size) {
m <- max(sample[sample$companyID == val1, 4])
size2 <- c(1:m)
for (val2 in size2){
row <- sample[which(sample$companyID == val1 & sample$yearID == val2)]
m1 <- sample[sample$companyID == val1 & sample$yearID == val2, 2]
m2 <- sample[sample$CompanyID == val1 & sample$yearID == (val2-1), 2]
if(val2>1 && m1-m2 > 1) {
sample$sequence[row] s = s+1}
else {s = s}
}
}
Where m is the max value of the yearID per companyID, row is to identify that the value should be entered on the row where companyID = val1 and yearID = val2, m1 is from the year variable and is the latter year, whereas m2 is the former year. What I have tried to do is to change the sequence every time m1-m2 > 1 (when val2 > 1 also).
Desired outcome:
companyID year yearID sequence
1 2010 1 1
1 2011 2 1
1 2012 3 1
1 2013 4 1
2 2010 1 2
2 2011 2 2
2 2016 3 3
2 2017 4 3
2 2018 5 3
3 2010 1 4
3 2011 2 4
3 2014 3 5
3 2017 4 6
3 2018 5 6
Super appreciative if anyone can help!!
This is a good question!
First group_by companyID
calculate the difference of each consecutive row in year column with lag to identify if year is consecutive.
group_by companyID and yearID
mutate a helper column sequence1 that assigns 1 to each row starting a consecutive run of years within a group.
ungroup and assign a sequence number each time a 1
occurs in sequence1
remove column sequence1 and deltalag1
library(tidyverse)
# deltaLag1: gap to the previous year within each company (NA on a
# company's first row). sequence1 is 1 at each row that *starts* a
# consecutive run (first row of a company, or a gap of more than 1 year);
# cumsum over those 1s then yields a globally increasing run id.
# NOTE(review): the second group_by(companyID, yearID) makes every row its
# own group and appears unnecessary — the case_when is row-wise anyway.
df1 <- df %>%
group_by(companyID) %>%
mutate(deltaLag1 = year - lag(year, 1)) %>%
group_by(companyID, yearID) %>%
mutate(sequence1 = case_when(is.na(deltaLag1) | deltaLag1 > 1 ~ 1,
TRUE ~ 2)) %>%
ungroup() %>%
mutate(sequence = cumsum(sequence1==1)) %>%
select(-deltaLag1, -sequence1)
data
# Question data as a tibble: yearID restarts at 1 for each companyID;
# companies 2 and 3 have gaps in their year sequences.
df <- tribble(
~companyID, ~year, ~yearID,
1, 2010, 1,
1, 2011, 2,
1, 2012, 3,
1, 2013, 4,
2, 2010, 1,
2, 2011, 2,
2, 2016, 3,
2, 2017, 4,
2, 2018, 5,
3, 2010, 1,
3, 2011, 2,
3, 2014, 3,
3, 2017, 4,
3, 2018, 5)
It's not clear if you want the exact desired outcome or check that you have consecutive years by companyID.
According to your title message:
sample <- read.table(header = TRUE, text = "
companyID year yearID
1 2010 1
1 2011 2
1 2012 3
1 2013 4
2 2010 1
2 2011 2
2 2016 3
2 2017 4
2 2018 5
3 2010 1
3 2011 2
3 2014 3
3 2017 4
3 2018 5
")
library(data.table)
sample <- setDT(sample)
# diff_year: year-over-year gap within each company. NA marks a company's
# first row; any value > 1 marks a break in consecutive years.
sample[ , diff_year := year - shift(year), by = companyID]
sample <- setDF(sample)
sample
#> companyID year yearID diff_year
#> 1 1 2010 1 NA
#> 2 1 2011 2 1
#> 3 1 2012 3 1
#> 4 1 2013 4 1
#> 5 2 2010 1 NA
#> 6 2 2011 2 1
#> 7 2 2016 3 5
#> 8 2 2017 4 1
#> 9 2 2018 5 1
#> 10 3 2010 1 NA
#> 11 3 2011 2 1
#> 12 3 2014 3 3
#> 13 3 2017 4 3
#> 14 3 2018 5 1
# Created on 2021-03-13 by the reprex package (v1.0.0.9002)
Related to Calculate difference between values in consecutive rows by group
Regards,

add a column to a dataframe in r based on unique values row count

I have a consolidated table as given below:
> dput(data.frame(df))
structure(list(make = c("audi", "audi", "audi", "audi", "bmw",
"bmw", "toyota", "toyota", "toyota", "honda", "honda", "honda",
"honda"), model = c("A3", "A3", "A4", "A4", "3 Series", "3 Series",
"Land Cruiser", "Camry", "Camry", "Accord", "Accord", "civic",
"civic"), variant = c("1.4L TFSI", "1.6L TFSI", "1.6L", "1.8L TFSI Quattro",
"320d", "320d", "4.2L VX AT", "2.4L LE MT", "2.4L LE MT", "2.3L VTI AT",
"2.3L VTI S", "1.8L SPORT", "1.8L V"), from_year = c(2014, 2008,
2004, 2011, 2012, 2015, 1998, 2001, 2006, 2001, 2001, 2009, 2006
), to_year = c(2020, 2012, 2008, 2016, 2015, 2020, 2003, 2006,
2011, 2003, 2003, 2012, 2009), id = c(1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13)), class = "data.frame", row.names = c(NA,
-13L))
dataframe structure:
> df
make model variant from_year to_year id
1 audi A3 1.4L TFSI 2014 2020 1
2 audi A3 1.6L TFSI 2008 2012 2
3 audi A4 1.6L 2004 2008 3
4 audi A4 1.8L TFSI Quattro 2011 2016 4
5 bmw 3 Series 320d 2012 2015 5
6 bmw 3 Series 320d 2015 2020 6
7 toyota Land Cruiser 4.2L VX AT 1998 2003 7
8 toyota Camry 2.4L LE MT 2001 2006 8
9 toyota Camry 2.4L LE MT 2006 2011 9
10 honda Accord 2.3L VTI AT 2001 2003 10
11 honda Accord 2.3L VTI S 2001 2003 11
12 honda civic 1.8L SPORT 2009 2012 12
13 honda civic 1.8L V 2006 2009 13
I need to normalize the table based on make, model and variant, with primary key for each table
Expected Output:
> dput(df1)
structure(list(make = c("audi", "audi", "audi", "audi", "bmw",
"bmw", "toyota", "toyota", "toyota", "honda", "honda", "honda",
"honda"), make_id = c(1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4),
model = c("A3", "A3", "A4", "A4", "3 Series", "3 Series",
"Land Cruiser", "Camry", "Camry", "Accord", "Accord", "civic",
"civic"), vehicle_model_id = c(1, 1, 2, 2, 3, 3, 4, 5, 5,
6, 6, 7, 7), variant = c("1.4L TFSI", "1.6L TFSI", "1.6L",
"1.8L TFSI Quattro", "320d", "320d", "4.2L VX AT", "2.4L LE MT",
"2.4L LE MT", "2.3L VTI AT", "2.3L VTI S", "1.8L SPORT",
"1.8L V"), from_year = c(2014, 2008, 2004, 2011, 2012, 2015,
1998, 2001, 2006, 2001, 2001, 2009, 2006), to_year = c(2020,
2012, 2008, 2016, 2015, 2020, 2003, 2006, 2011, 2003, 2003,
2012, 2009), id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13)), row.names = c(NA, -13L), class = c("tbl_df", "tbl",
"data.frame"))
> df1
# A tibble: 13 x 8
make make_id model vehicle_model_id variant from_year to_year id
<chr> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
1 audi 1 A3 1 1.4L TFSI 2014 2020 1
2 audi 1 A3 1 1.6L TFSI 2008 2012 2
3 audi 1 A4 2 1.6L 2004 2008 3
4 audi 1 A4 2 1.8L TFSI Quattro 2011 2016 4
5 bmw 2 3 Series 3 320d 2012 2015 5
6 bmw 2 3 Series 3 320d 2015 2020 6
7 toyota 3 Land Cruiser 4 4.2L VX AT 1998 2003 7
8 toyota 3 Camry 5 2.4L LE MT 2001 2006 8
9 toyota 3 Camry 5 2.4L LE MT 2006 2011 9
10 honda 4 Accord 6 2.3L VTI AT 2001 2003 10
11 honda 4 Accord 6 2.3L VTI S 2001 2003 11
12 honda 4 civic 7 1.8L SPORT 2009 2012 12
13 honda 4 civic 7 1.8L V 2006 2009 13
>
Could someone let me know how to get the new 'id' columns as shown above based on the unique value of strings in "make" and "model" columns.
Once I get the above output, I can then get the normalized tables as below.
> manufacturers
id make
1 audi
2 bmw
3 toyota
4 honda
> models
id make_id model
1 1 A3
2 1 A4
3 2 3 Series
4 3 Land Cruiser
5 3 Camry
6 4 Accord
7 4 civic
> variants
id name vehicle_model_id from_year to_year
1 1.4 TFSI 1 2014 2020
2 1.6 TFSI 1 2008 2012
3 1.6L 2 2004 2008
4 1.8L TFSI Quattro 2 2011 2016
..
..
Here's how you might do this using dplyr:
Manufacturers
library(dplyr)
# Manufacturers lookup: assign each make an id from its alphabetical
# factor order, then collapse to one row per make (the mean of a constant
# id within a group is the id itself). select(2:1) puts id first.
df %>%
mutate(id = as.numeric(as.factor(make))) %>%
group_by(make) %>%
summarise(id = mean(id)) %>%
select(2:1)
#> # A tibble: 4 x 2
#> id make
#> <dbl> <chr>
#> 1 1 audi
#> 2 2 bmw
#> 3 3 honda
#> 4 4 toyota
Models
# Models lookup: make_id again comes from alphabetical factor order of
# make; models are sorted by make_id and given sequential ids with
# row_number(). select(c(3, 1, 2)) orders the columns as id, model, make_id.
df %>% mutate(make_id = as.numeric(as.factor(make))) %>%
group_by(model) %>%
summarize(make_id = mean(make_id)) %>%
arrange(make_id) %>%
mutate(id = row_number()) %>%
select(c(3, 1, 2))
#> # A tibble: 7 x 3
#> id model make_id
#> <int> <chr> <dbl>
#> 1 1 A3 1
#> 2 2 A4 1
#> 3 3 3 Series 2
#> 4 4 Accord 3
#> 5 5 civic 3
#> 6 6 Camry 4
#> 7 7 Land Cruiser 4
Variants
# Variants lookup: one row per variant occurrence, keyed by the model's
# alphabetical factor id.
# NOTE(review): summarize() returning more than one row per group (the
# bare `from_year = from_year`) is deprecated in dplyr >= 1.1; reframe()
# is the modern equivalent — confirm before reusing on a newer dplyr.
df %>% mutate(model_id = as.numeric(as.factor(model))) %>%
group_by(variant) %>%
summarize(vehicle_model_id = mean(model_id),
from_year = from_year,
to_year = to_year) %>%
arrange(vehicle_model_id) %>%
ungroup() %>%
mutate(id = row_number()) %>%
select(5, 1:4)
#> # A tibble: 13 x 5
#> id variant vehicle_model_id from_year to_year
#> <int> <chr> <dbl> <dbl> <dbl>
#> 1 1 320d 1 2012 2015
#> 2 2 320d 1 2015 2020
#> 3 3 1.4L TFSI 2 2014 2020
#> 4 4 1.6L TFSI 2 2008 2012
#> 5 5 1.6L 3 2004 2008
#> 6 6 1.8L TFSI Quattro 3 2011 2016
#> 7 7 2.3L VTI AT 4 2001 2003
#> 8 8 2.3L VTI S 4 2001 2003
#> 9 9 2.4L LE MT 5 2001 2006
#> 10 10 2.4L LE MT 5 2006 2011
#> 11 11 1.8L SPORT 6 2009 2012
#> 12 12 1.8L V 6 2006 2009
#> 13 13 4.2L VX AT 7 1998 2003
Created on 2020-07-09 by the reprex package (v0.3.0)
library(dplyr)
# Minimal illustration: factor() + as.numeric() maps each distinct string
# to an integer id in alphabetical order ("A" -> 1, "B" -> 2, "C" -> 3).
df <- data.frame(values = c("A", "A", "B", "B", "C", "A"))
df <- df %>% mutate(id = as.numeric(factor(values)))
Just factorize then numerize the strings. If you want the numbers according to their appearance rather than alphabetical, you may define levels= based on unique values.
# Base R alternative: levels = unique(make) numbers the ids in order of
# first appearance in the data rather than alphabetically.
df <- transform(df,
make_id=as.numeric(factor(make, levels=unique(make))),
vehicle_model_id=as.numeric(factor(model, levels=unique(model))))
df
# make model variant from_year to_year id make_id vehicle_model_id
# 1 audi A3 1.4L TFSI 2014 2020 1 1 1
# 2 audi A3 1.6L TFSI 2008 2012 2 1 1
# 3 audi A4 1.6L 2004 2008 3 1 2
# 4 audi A4 1.8L TFSI Quattro 2011 2016 4 1 2
# 5 bmw 3 Series 320d 2012 2015 5 2 3
# 6 bmw 3 Series 320d 2015 2020 6 2 3
# 7 toyota Land Cruiser 4.2L VX AT 1998 2003 7 3 4
# 8 toyota Camry 2.4L LE MT 2001 2006 8 3 5
# 9 toyota Camry 2.4L LE MT 2006 2011 9 3 5
# 10 honda Accord 2.3L VTI AT 2001 2003 10 4 6
# 11 honda Accord 2.3L VTI S 2001 2003 11 4 6
# 12 honda civic 1.8L SPORT 2009 2012 12 4 7
# 13 honda civic 1.8L V 2006 2009 13 4 7

Resources