I have a dataframe with rows grouped by Year. Variables don't always have observations in each year, but when they do, there are exactly 3 observations in that year, and those observations may sit in different rows from the observations of the other variables.
> na_data
Year Peter Paul John
1 2011 1 NA NA
2 2011 2 NA NA
3 2011 3 NA NA
4 2011 NA 1 NA
5 2011 NA 2 NA
6 2011 NA 3 NA
7 2012 1 NA NA
8 2012 NA 3 NA
9 2012 2 NA NA
10 2012 NA 2 NA
11 2012 3 NA NA
12 2012 NA 1 NA
13 2013 NA 1 4
14 2013 NA 2 5
15 2013 NA 3 6
16 2013 1 NA NA
17 2013 2 NA NA
18 2013 3 NA NA
I want to remove the NAs in each column by group. Such that the output looks like this:
final_data
Year Peter Paul John
[1,] 2011 1 1 NA
[2,] 2011 2 2 NA
[3,] 2011 3 3 NA
[4,] 2012 1 3 NA
[5,] 2012 2 2 NA
[6,] 2012 3 1 NA
[7,] 2013 1 1 4
[8,] 2013 2 2 5
[9,] 2013 3 3 6
So far I have used a loop but I am looking for a cleaner solution if anyone can help that would be great. My solution:
# Collapse na_data to 3 rows per Year by dropping each column's NAs within
# that year. Every name refers to `na_data`: the original mixed in an
# undefined `full_data` and a bare `Year` that is not in scope inside `[`.
years <- unique(na_data$Year)
cleaned_list <- vector("list", length(years))
names(cleaned_list) <- years
for (yr in years) {
  in_year <- na_data$Year == yr
  # One 3-row slab per year; columns without observations stay NA.
  temp <- matrix(
    NA,
    nrow = 3,
    ncol = ncol(na_data),
    dimnames = list(NULL, colnames(na_data))
  )
  for (name in colnames(na_data)[-1]) {
    no_nas <- as.vector(na.omit(na_data[in_year, name]))
    if (length(no_nas) != 0) {
      temp[, name] <- no_nas
    }
  }
  temp[, 1] <- yr
  cleaned_list[[as.character(yr)]] <- temp
}
final_data <- do.call("rbind", cleaned_list)
Data:
# Example input: 6 rows per year; each person has either 3 observations
# or none in a given year.
na_data <- data.frame(
  Year  = rep(c(2011, 2012, 2013), each = 6),
  Peter = c(1, 2, 3, NA, NA, NA,
            1, NA, 2, NA, 3, NA,
            NA, NA, NA, 1, 2, 3),
  Paul  = c(NA, NA, NA, 1, 2, 3,
            NA, 3, NA, 2, NA, 1,
            1, 2, 3, NA, NA, NA),
  John  = c(rep(NA_integer_, 12), 4L, 5L, 6L, rep(NA_integer_, 3))
)

# Expected result: same as final_data, but as a data frame.
desired <- data.frame(
  Year  = rep(c(2011, 2012, 2013), each = 3),
  Peter = rep(1:3, times = 3),
  Paul  = c(1:3, 3:1, 1:3),
  John  = c(rep(NA_integer_, 6), 4:6)
)
Here is one possible solution using data.table package:
library(data.table)

# Per column: the non-missing values if there are any, otherwise the
# first (NA) element so the column still contributes to the group.
keep_observed <- function(x) {
  observed <- na.omit(x)
  if (length(observed)) observed else first(x)
}

setDT(na_data)[, lapply(.SD, keep_observed), by = Year]
# Year Peter Paul John
# 1: 2011 1 1 NA
# 2: 2011 2 2 NA
# 3: 2011 3 3 NA
# 4: 2012 1 3 NA
# 5: 2012 2 2 NA
# 6: 2012 3 1 NA
# 7: 2013 1 1 4
# 8: 2013 2 2 5
# 9: 2013 3 3 6
dplyr equivalent:
library(dplyr)

# Per Year, keep each column's non-missing values; a column with no
# observations in that year contributes a single NA instead.
# reframe() is used because the result has several rows per group:
# returning more than one row per group from summarise() and calling
# across() without `.cols` are both deprecated since dplyr 1.1.0.
# reframe() always returns an ungrouped tibble.
na_data |>
  group_by(Year) |>
  reframe(across(everything(), \(x) {
    observed <- x[!is.na(x)]
    if (length(observed) > 0) observed else first(x)
  }))
# # A tibble: 9 × 4
#    Year Peter  Paul  John
#   <dbl> <dbl> <dbl> <int>
# 1  2011     1     1    NA
# 2  2011     2     2    NA
# 3  2011     3     3    NA
# 4  2012     1     3    NA
# 5  2012     2     2    NA
# 6  2012     3     1    NA
# 7  2013     1     1     4
# 8  2013     2     2     5
# 9  2013     3     3     6
Convert to long form, remove the NA's, add a sequence number n, convert back and remove n.
library(dplyr)
library(tidyr)

# Stack the person columns, drop the missing cells, number the surviving
# values within each (Year, person), then spread the persons out again.
na_data %>%
  pivot_longer(cols = -Year) %>%
  drop_na() %>%
  group_by(Year, name) %>%
  mutate(n = row_number()) %>%
  ungroup() %>%
  pivot_wider(names_from = name, values_from = value) %>%
  select(-n)
giving:
# A tibble: 9 x 4
Year Peter Paul John
<dbl> <dbl> <dbl> <dbl>
1 2011 1 1 NA
2 2011 2 2 NA
3 2011 3 3 NA
4 2012 1 3 NA
5 2012 2 2 NA
6 2012 3 1 NA
7 2013 1 1 4
8 2013 2 2 5
9 2013 3 3 6
Related
This question already has answers here:
How can I automatically create n lags in a timeseries?
(3 answers)
Closed 1 year ago.
Given this tibble:
tibble(x = c(1:9))
I want to add a column x_lag_1 = c(NA,1:8), a column x_lag_2 = c(NA,NA,1:7), etc.
Up to x_lag_n.
This can be quick with data.table:
library(data.table)

# Create the input explicitly: without this, `df` resolves to stats::df
# (the F-distribution density) and setDT() fails.
df <- data.table(x = 1:9)

# shift() with a vector of lags returns one column per lag; `:=` adds
# them all by reference.
n <- seq_len(4)
df[, paste0("x_lag_", n) := shift(x, n)]
df
x x_lag_1 x_lag_2 x_lag_3 x_lag_4
1: 1 NA NA NA NA
2: 2 1 NA NA NA
3: 3 2 1 NA NA
4: 4 3 2 1 NA
5: 5 4 3 2 1
6: 6 5 4 3 2
7: 7 6 5 4 3
8: 8 7 6 5 4
9: 9 8 7 6 5
You may use map_dfc to add n new columns.
library(dplyr)
library(purrr)

df <- tibble(x = c(1:9))
n <- 3

# Build one single-column tibble per lag, bind those column-wise, then
# attach them to the original data.
lag_cols <- map_dfc(seq_len(n), function(k) {
  transmute(df, !!paste0("x_lag", k) := lag(x, k))
})
bind_cols(df, lag_cols)
# x x_lag1 x_lag2 x_lag3
# <int> <int> <int> <int>
#1 1 NA NA NA
#2 2 1 NA NA
#3 3 2 1 NA
#4 4 3 2 1
#5 5 4 3 2
#6 6 5 4 3
#7 7 6 5 4
#8 8 7 6 5
#9 9 8 7 6
Edit 2: Reworked the answer to contemplate the case of a grouped df.
library(tidyverse)

set.seed(123)
df <- tibble(group = sample(letters[1:3], 30, replace = TRUE), x = c(1:30))

# One purrr-style lambda per lag (3, 6, 9, 12), named lag3, lag6, ... so
# that across() produces columns x_lag3, x_lag6, ...
lags <- seq(3, 12, 3)
formulas <- lags %>%
  map(~ as.formula(str_glue("~lag(.,n={.x})"))) %>%
  set_names(str_c("lag", lags))

# reframe() rather than summarise(): the result has as many rows as the
# input, and multi-row summarise() results are deprecated since dplyr 1.1.0.
df %>%
  reframe(x, across(x, lst(!!!formulas)))
#> # A tibble: 30 × 5
#> x x_lag3 x_lag6 x_lag9 x_lag12
#> <int> <int> <int> <int> <int>
#> 1 1 NA NA NA NA
#> 2 2 NA NA NA NA
#> 3 3 NA NA NA NA
#> 4 4 1 NA NA NA
#> 5 5 2 NA NA NA
#> 6 6 3 NA NA NA
#> 7 7 4 1 NA NA
#> 8 8 5 2 NA NA
#> 9 9 6 3 NA NA
#> 10 10 7 4 1 NA
#> # … with 20 more rows
# Same lags computed within each group. reframe() replaces the deprecated
# multi-row summarise() and always returns an ungrouped tibble, so no
# `.groups = "drop"` is needed.
df %>%
  group_by(group) %>%
  reframe(x, across(x, lst(!!!formulas)))
#> # A tibble: 30 × 6
#> group x x_lag3 x_lag6 x_lag9 x_lag12
#> <chr> <int> <int> <int> <int> <int>
#> 1 a 10 NA NA NA NA
#> 2 a 13 NA NA NA NA
#> 3 a 16 NA NA NA NA
#> 4 a 19 10 NA NA NA
#> 5 a 20 13 NA NA NA
#> 6 a 21 16 NA NA NA
#> 7 a 22 19 10 NA NA
#> 8 a 27 20 13 NA NA
#> 9 b 4 NA NA NA NA
#> 10 b 6 NA NA NA NA
#> # … with 20 more rows
Created on 2021-12-30 by the reprex package (v2.0.1)
I have a dataframe like this:
# tibble() replaces the deprecated data_frame() constructor.
df <- tibble::tibble(
  id = rep(c("A", "B"), each = 10),
  value = c(1:3, rep(NA, 2), 1:2, rep(NA, 3), 1, rep(NA, 4), 1:3, rep(NA, 2))
)
I need to count the number of consecutive NAs in the value column. The count needs to be grouped by ID, and it needs to restart at 1 every time a new NA or a new series of NAs is encountered. The expected output should look like this:
df$expected_output <- c(rep(NA, 3), 1:2, rep(NA, 2), 1:3, NA, 1:4, rep(NA, 3), 1:2)
If anyone can give me a dplyr solution that would also be great :)
I've tried a few things, but nothing is giving any sort of sensible result. Thanks in advance!
A solution using dplyr and data.table.
library(dplyr)
library(data.table)

# Label each run of identical values within an id, number the rows inside
# every run, then blank the counter wherever `value` is observed.
df2 <- df %>%
  group_by(id) %>%
  mutate(info = rleid(value)) %>%
  group_by(id, info) %>%
  mutate(expected_output = row_number()) %>%
  ungroup() %>%
  mutate(expected_output = ifelse(is.na(value), expected_output, NA)) %>%
  select(-info)

df2
# # A tibble: 20 x 3
# id value expected_output
# <chr> <dbl> <int>
# 1 A 1 NA
# 2 A 2 NA
# 3 A 3 NA
# 4 A NA 1
# 5 A NA 2
# 6 A 1 NA
# 7 A 2 NA
# 8 A NA 1
# 9 A NA 2
# 10 A NA 3
# 11 B 1 NA
# 12 B NA 1
# 13 B NA 2
# 14 B NA 3
# 15 B NA 4
# 16 B 1 NA
# 17 B 2 NA
# 18 B 3 NA
# 19 B NA 1
# 20 B NA 2
We can use rle to get length of groups that are or are not na, and use purrr::map2 to apply seq if they are NA and get the growing count or just fill in with NA values using rep.
library(tidyverse)
# Count consecutive missing values.
#
# x: an atomic vector.
# Returns an integer vector the same length as `x`: for every NA, its
# 1-based position within the current run of NAs; NA_integer_ for every
# non-missing element. Always integer, including for empty input (the
# previous version returned NULL for length-0 input and a logical vector
# when `x` had no NAs).
count_na <- function(x) {
  missing <- is.na(x)
  runs <- rle(missing)
  # Position of each element inside its own run (NA runs and value runs).
  position <- sequence(runs$lengths)
  # Only positions inside NA runs are wanted.
  position[!missing] <- NA_integer_
  position
}
# Count within each id so that a run of NAs ending one id is not
# continued by NAs that start the next id.
df %>%
  group_by(id) %>%
  mutate(expected_output = count_na(value)) %>%
  ungroup()
#> # A tibble: 20 × 3
#> id value expected_output
#> <chr> <dbl> <int>
#> 1 A 1 NA
#> 2 A 2 NA
#> 3 A 3 NA
#> 4 A NA 1
#> 5 A NA 2
#> 6 A 1 NA
#> 7 A 2 NA
#> 8 A NA 1
#> 9 A NA 2
#> 10 A NA 3
#> 11 B 1 NA
#> 12 B NA 1
#> 13 B NA 2
#> 14 B NA 3
#> 15 B NA 4
#> 16 B 1 NA
#> 17 B 2 NA
#> 18 B 3 NA
#> 19 B NA 1
#> 20 B NA 2
Here is a solution using rle:
# Runs are taken over (id, is-missing) pairs so that a change of id always
# starts a new run, even when both neighbouring rows are NA.
is_missing <- is.na(df$value)
x <- rle(paste(df$id, is_missing))
# Fill the whole column first, then blank the observed rows. This keeps
# `new` the same length as the data even when `value` ends with a non-NA.
df$new <- sequence(x$lengths)
df$new[!is_missing] <- NA_integer_
# A tibble: 20 x 3
id value new
<chr> <dbl> <int>
1 A 1 NA
2 A 2 NA
3 A 3 NA
4 A NA 1
5 A NA 2
6 A 1 NA
7 A 2 NA
8 A NA 1
9 A NA 2
10 A NA 3
11 B 1 NA
12 B NA 1
13 B NA 2
14 B NA 3
15 B NA 4
16 B 1 NA
17 B 2 NA
18 B 3 NA
19 B NA 1
20 B NA 2
Yet another solution:
library(tidyverse)

# `run` labels consecutive identical values; within each (id, run) the
# row number is kept only for the rows where `value` is missing.
df %>%
  mutate(run = data.table::rleid(value)) %>%
  group_by(id, run) %>%
  mutate(eout = ifelse(is.na(value), row_number(), NA_real_)) %>%
  ungroup() %>%
  select(-run)
#> # A tibble: 20 × 4
#> id value expected_output eout
#> <chr> <dbl> <int> <dbl>
#> 1 A 1 NA NA
#> 2 A 2 NA NA
#> 3 A 3 NA NA
#> 4 A NA 1 1
#> 5 A NA 2 2
#> 6 A 1 NA NA
#> 7 A 2 NA NA
#> 8 A NA 1 1
#> 9 A NA 2 2
#> 10 A NA 3 3
#> 11 B 1 NA NA
#> 12 B NA 1 1
#> 13 B NA 2 2
#> 14 B NA 3 3
#> 15 B NA 4 4
#> 16 B 1 NA NA
#> 17 B 2 NA NA
#> 18 B 3 NA NA
#> 19 B NA 1 1
#> 20 B NA 2 2
I've been using wide table format to create a migration variable (year, municipality -> year, municipality, move) and was wondering if I can flip it back into long table format. However, I now have 2 groups of columns per year instead of one. I looked through the existing posts on SO, but couldn't find anything similar.
Here's what I have done:
library(tidyverse)
library(rlang)
# Sample data: 10 ids observed in 2009-2011, with a municipality code
# drawn at random from NA, 1, 2, 3.
mydata <- data.frame(
  id = rep(1:10, each = 3),
  year = rep(2009:2011, times = 10),
  municip = sample(c(NA, 1:3), size = 30, replace = TRUE)
)
The data looks like this:
id
year
municip
1
2009
2
1
2010
1
1
2011
3
2
2009
1
2
2010
1
2
2011
3
3
2009
NA
3
2010
NA
3
2011
NA
# Turn sideways: one row per id, one municip.<year> column per year.
mydata.wide <- pivot_wider(
  mydata,
  names_from = year,
  values_from = municip,
  names_prefix = "municip."
)
Now it looks like this:
id
municip.2009
municip.2010
municip.2011
1
2
1
3
2
1
1
3
3
NA
NA
NA
4
1
NA
3
5
1
NA
2
6
3
2
2
7
2
NA
3
8
3
NA
3
9
NA
1
NA
10
1
NA
2
Then I'm adding a migration variable (in reality this is done for 12 years):
# Create the migration variable move.<year> from the municipality in
# <year - 1> and <year>. Columns are looked up by name with [[ ]] rather
# than by pasting code together as text and running it through
# eval(parse_expr()).
for (i in 2009:2010) {
  prev <- mydata.wide[[paste0("municip.", i)]]
  curr <- mydata.wide[[paste0("municip.", i + 1)]]
  # NOTE(review): the first branch stores the string "NA", not a missing
  # value, exactly as before -- confirm that this is intended.
  mydata.wide[[paste0("move.", i + 1)]] <- case_when(
    is.na(prev) & is.na(curr) ~ "NA",
    is.na(prev) & !is.na(curr) ~ "1",
    !is.na(prev) & !is.na(curr) & prev != curr ~ "3",
    !is.na(prev) & is.na(curr) ~ "4",
    TRUE ~ "2"
  )
}
# NA: missing in both cases
# 1: move into region
# 2: stayed in region
# 3: moved within region
# 4: moved out of region
Now the table looks like this:
id
municip.2009
municip.2010
municip.2011
move.2010
move.2011
1
2
1
3
3
3
2
1
1
3
2
3
3
NA
NA
NA
NA
NA
4
1
NA
3
4
1
5
1
NA
2
4
1
6
3
2
2
3
2
7
2
NA
3
4
1
8
3
NA
3
4
1
9
NA
1
NA
1
4
10
1
NA
2
4
1
What I want to do is to flip it back to create something like this:
id
year
municip
move
1
2009
2
NA
1
2010
1
3
1
2011
3
3
2
2009
1
NA
2
2010
1
2
2
2011
3
3
3
2009
NA
NA
3
2010
NA
NA
3
2011
NA
NA
I'm not sure if this can be done with just pivot_longer on its own. I tried a couple of variations. Any ideas?
You can try this:
df <- tribble(
  ~id, ~municip.2009, ~municip.2010, ~municip.2011, ~move.2010, ~move.2011,
    1,             2,             1,             3,          3,          3,
    2,             1,             1,             3,          2,          3,
    3,            NA,            NA,            NA,         NA,         NA,
    4,             1,            NA,             3,          4,          1,
    5,             1,            NA,             2,          4,          1,
    6,             3,             2,             2,          3,          2,
    7,             2,            NA,             3,          4,          1,
    8,             3,            NA,             3,          4,          1,
    9,            NA,             1,            NA,          1,          4,
   10,             1,            NA,             2,          4,          1
)

# Stack every non-id column, split "<variable>.<year>" into its two parts,
# then spread the variables (municip, move) back out into columns.
df %>%
  pivot_longer(cols = -id, names_to = "variable_year", values_to = "value") %>%
  separate(col = variable_year, into = c("variable", "year")) %>%
  pivot_wider(names_from = variable, values_from = value)
pivot_longer collects municip and move in the same column; with separate split municip and move by the years; finally with pivot_wider you get the final result.
Don't think sideways, think longways!
Now, I cannot answer your question completely, because I don't really understand what you are calculating. Is it some sort of factor (1-4)? But I believe you can finish this yourself. Consider the following:
> mydata %>% group_by(id) %>%
arrange(year) %>%
mutate(last_year = lag(municip)) %>%
ungroup %>%
arrange(id) %>% as.data.frame # ignore this line, it is simply for the pleasure of seeing the data.frame
id year municip last_year
1 1 2009 3 NA
2 1 2010 2 3
3 1 2011 NA 2
4 2 2009 NA NA
5 2 2010 NA NA
6 2 2011 1 NA
7 3 2009 3 NA
8 3 2010 2 3
9 3 2011 2 2
10 4 2009 2 NA
11 4 2010 NA 2
12 4 2011 1 NA
13 5 2009 3 NA
14 5 2010 NA 3
15 5 2011 2 NA
16 6 2009 1 NA
17 6 2010 3 1
18 6 2011 2 3
19 7 2009 3 NA
20 7 2010 2 3
21 7 2011 2 2
22 8 2009 NA NA
23 8 2010 NA NA
24 8 2011 3 NA
25 9 2009 1 NA
26 9 2010 NA 1
27 9 2011 1 NA
28 10 2009 3 NA
29 10 2010 NA 3
30 10 2011 NA NA
You see? In long-form, you now can simply continue with
%>% mutate(move = case_when(
  # Plain quotes: backslash-escaped quotes are only valid inside a string
  # literal and are a syntax error here.
  is.na(.$municip) & is.na(.$last_year) ~ "NA",
  # etc.
))
Did you want the comparison from year i to the following year? Use the function lead instead of lag.
Lastly, your text-code might not work; when using case_when you have to refer to variables in the piped result with .$.
Something like this?
# Split "<name>.<year>" column names into a name and a year, stacking all
# values as character so municip and move can share one column ...
stacked <- pivot_longer(
  mydata.wide,
  cols = -id,
  names_to = c("name", "year"),
  names_pattern = "([a-z]+?)\\.(\\d+)",
  values_to = "val",
  values_transform = list(val = as.character)
)

# ... then give each name (municip, move) its own column again.
reshaped <- pivot_wider(stacked, names_from = name, values_from = val)

print(reshaped, n = 30)
A tibble: 30 × 4
id year municip move
<int> <chr> <chr> <chr>
1 1 2009 2 NA
2 1 2010 3 3
3 1 2011 NA 4
4 2 2009 2 NA
5 2 2010 NA 4
6 2 2011 2 1
7 3 2009 1 NA
8 3 2010 2 3
9 3 2011 1 3
10 4 2009 NA NA
11 4 2010 NA NA
12 4 2011 1 1
13 5 2009 NA NA
14 5 2010 2 1
15 5 2011 3 3
16 6 2009 3 NA
17 6 2010 3 2
18 6 2011 3 2
19 7 2009 NA NA
20 7 2010 NA NA
21 7 2011 NA NA
22 8 2009 NA NA
23 8 2010 2 1
24 8 2011 NA 4
25 9 2009 3 NA
26 9 2010 2 3
27 9 2011 NA 4
28 10 2009 2 NA
29 10 2010 3 3
30 10 2011 1 3
So, I have a dataset that looks just like that :
site year territories cat
1 10 2017 0.0 1
2 10 2016 NA NA
3 10 2015 2.0 1
4 10 2014 NA NA
5 10 2013 NA NA
6 11 2012 NA NA
7 11 2011 0.0 2
8 11 2010 NA NA
9 11 2009 1.0 2
But I do not want to have NAs in the cat column. Instead, I want every line within the same site to get the same value of cat.
Just like this :
site year territories cat
1 10 2017 0.0 1
2 10 2016 NA 1
3 10 2015 2.0 1
4 10 2014 NA 1
5 10 2013 NA 1
6 11 2012 NA 2
7 11 2011 0.0 2
8 11 2010 NA 2
9 11 2009 1.0 2
Any idea on how I can do that?
Use na.aggregate to fill in the NA values using ave to do it by site.
library(zoo)
# ave() applies na.aggregate() to `cat` separately within each `site`, so
# a site's missing `cat` values are filled from that site's own observed
# values. transform() returns the modified copy; DF itself is not changed.
transform(DF, cat = ave(cat, site, FUN = na.aggregate))
giving:
site year territories cat
1 10 2017 0 1
2 10 2016 NA 1
3 10 2015 2 1
4 10 2014 NA 1
5 10 2013 NA 1
6 11 2012 NA 2
7 11 2011 0 2
8 11 2010 NA 2
9 11 2009 1 2
Note
The input used, in reproducible form, is:
# Raw input as text: the header row has one field fewer than the data
# rows, so the first field of each data row becomes the row name.
Lines <- "
site year territories cat
1 10 2017 0.0 1
2 10 2016 NA NA
3 10 2015 2.0 1
4 10 2014 NA NA
5 10 2013 NA NA
6 11 2012 NA NA
7 11 2011 0.0 2
8 11 2010 NA NA
9 11 2009 1.0 2"
DF <- read.table(header = TRUE, text = Lines)
A complete base R alternative:
# First non-missing element of x (NA when there is none); ave() recycles
# it over every row of the site.
first_non_na <- function(x) {
  observed <- x[!is.na(x)]
  observed[1]
}
transform(DF, cat = ave(cat, site, FUN = first_non_na))
which gives:
site year territories cat
1 10 2017 0 1
2 10 2016 NA 1
3 10 2015 2 1
4 10 2014 NA 1
5 10 2013 NA 1
6 11 2012 NA 2
7 11 2011 0 2
8 11 2010 NA 2
9 11 2009 1 2
The same logic implemented with dplyr:
library(dplyr)

# Give every row of a site that site's first observed `cat`, then drop
# the grouping so later verbs do not silently operate per site.
DF %>%
  group_by(site) %>%
  mutate(cat = na.omit(cat)[1]) %>%
  ungroup()
Or with na.locf of the zoo-package:
library(zoo)
# Two na.locf() passes per site, applied through ave(): the inner call
# runs with fromLast = TRUE and na.rm = FALSE, the outer call with the
# defaults. NOTE(review): presumably this fills backwards first and then
# forwards so that no NA is left within a site -- confirm against the zoo
# documentation.
transform(DF, cat = ave(cat, site, FUN = function(x) na.locf(na.locf(x, fromLast = TRUE, na.rm = FALSE))))
Or with fill from tidyr:
library(tidyr)
library(dplyr)

# Within each site, fill `cat` downwards first and then upwards.
DF %>%
  group_by(site) %>%
  fill(cat, .direction = "downup")
NOTE: I wonder what the added value of the cat column is when cat has to be the same for each site. You'll end up with two grouping variables that do exactly the same thing, which makes one of them redundant in my opinion.
You can also use tidyr::fill
library(dplyr)
library(tidyr)

# Within each site, fill `cat` upwards first and then downwards, and
# return an ungrouped result.
DF %>%
  group_by(site) %>%
  fill(cat, .direction = "updown") %>%
  ungroup()
# # A tibble: 9 x 4
# site year territories cat
# <int> <int> <dbl> <int>
# 1 10 2017 0 1
# 2 10 2016 NA 1
# 3 10 2015 2 1
# 4 10 2014 NA 1
# 5 10 2013 NA 1
# 6 11 2012 NA 2
# 7 11 2011 0 2
# 8 11 2010 NA 2
# 9 11 2009 1 2
I reshape data using data.table.
library(data.table)
# Wide input: one row per stock, with roa measured in 2013/2014 and lev
# measured in 2013/2016 (the year sets differ between the two measures).
market <- data.table(
stkcd=c(1,2),
type =c(1,0),
roa2013=c(2,3),
roa2014=c(4,5),
lev2013=c(6,7),
lev2016=c(8,9))
market
# stkcd type roa2013 roa2014 lev2013 lev2016
# 1: 1 1 2 4 6 8
# 2: 2 0 3 5 7 9
# Attempted reshape. As the output that follows shows, `year` comes out
# as 1/2 rather than the year taken from the column names, and roa2014
# ends up on the same row as lev2016 -- this is the undesired result the
# question is about.
melt(market,
measure.vars = patterns("^roa", "^lev"),
variable.name = "year",
value.name = c("roa","lev"))
# stkcd type year roa lev
# 1: 1 1 1 2 6
# 2: 2 0 1 3 7
# 3: 1 1 2 4 8
# 4: 2 0 2 5 9
This is how the final data should look like.
# stkcd type year roa lev
# 1 1 1 2013 2 6
# 2 1 1 2014 4 NA
# 3 1 1 2016 NA 8
# 4 2 0 2013 3 7
# 5 2 0 2014 5 NA
# 6 2 0 2016 NA 9
Does anybody have any good ways for it?
Thanks.
We can do this easily with splitstackshape. Create a delimiter between the numeric and non-numeric parts of the names of the columns of interest, then use merged.stack to reshape into 'long' format and change the '.time_1' column name to 'year'.
library(splitstackshape)

# Insert "_" before the year in every column name (roa2013 -> roa_2013).
# setnames() renames the data.table by reference; assigning through
# `names<-` is avoided for data.tables.
setnames(market, sub("(\\d+)", "_\\1", names(market)))

# Stack the roa_* and lev_* columns; the part after "_" ends up in
# `.time_1`, which is then renamed to `year`.
res <- merged.stack(market, var.stubs = c("roa", "lev"), sep = "_")
setnames(res, ".time_1", "year")
res
# stkcd type year roa lev
#1: 1 1 2013 2 6
#2: 1 1 2014 4 NA
#3: 1 1 2016 NA 8
#4: 2 0 2013 3 7
#5: 2 0 2014 5 NA
#6: 2 0 2016 NA 9
1. Use reshape {stats}:
library(data.table)

market <- data.table(
  stkcd = c(1, 2),
  type = c(1, 0),
  roa2013 = c(2, 3),
  roa2014 = c(4, 5),
  lev2013 = c(6, 7),
  lev2016 = c(8, 9)
)

# Add the two missing measure/year combinations as all-NA columns so that
# every year has both a roa and a lev column.
market[, c("roa2016", "lev2014") := NA]

# With sep = "", reshape() splits names such as "roa2013" into the
# measure ("roa") and the time value (2013).
measure_cols <- c(
  "roa2013", "lev2013",
  "roa2014", "lev2014",
  "roa2016", "lev2016"
)
long <- reshape(
  market,
  direction = "long",
  idvar = "stkcd",
  varying = measure_cols,
  timevar = "year",
  sep = ""
)
setorder(long, stkcd, year)
long
# stkcd type year roa lev
# 1: 1 1 2013 2 6
# 2: 1 1 2014 4 NA
# 3: 1 1 2016 NA 8
# 4: 2 0 2013 3 7
# 5: 2 0 2014 5 NA
# 6: 2 0 2016 NA 9
2. Use str_extract {stringr}:
library(data.table)
library(stringr)

market <- data.table(
  stkcd = c(1, 2),
  type = c(1, 0),
  roa2013 = c(2, 3),
  roa2014 = c(4, 5),
  lev2013 = c(6, 7),
  lev2016 = c(8, 9)
)
market

# Stack every measure column, keeping the identifiers.
long <- melt(market, id.vars = c("stkcd", "type"))

# Split the old column name into its year (four digits) and its measure
# (the letters), then drop the combined name.
long[, year := str_extract(variable, pattern = "[0-9]{4}")]
long[, vars := str_extract(variable, pattern = "[a-zA-Z]{1,}")]
long[, variable := NULL]

# One column per measure again; combinations that do not exist become NA.
long <- dcast(long, stkcd + type + year ~ vars, value.var = "value")
long
# stkcd type year lev roa
# 1: 1 1 2013 6 2
# 2: 1 1 2014 NA 4
# 3: 1 1 2016 8 NA
# 4: 2 0 2013 7 3
# 5: 2 0 2014 NA 5
# 6: 2 0 2016 9 NA
...