R data.table reshape data - r

I reshape data using data.table.
library(data.table)
market <- data.table(
stkcd=c(1,2),
type =c(1,0),
roa2013=c(2,3),
roa2014=c(4,5),
lev2013=c(6,7),
lev2016=c(8,9))
market
# stkcd type roa2013 roa2014 lev2013 lev2016
# 1: 1 1 2 4 6 8
# 2: 2 0 3 5 7 9
melt(market,
measure.vars = patterns("^roa", "^lev"),
variable.name = "year",
value.name = c("roa","lev"))
# stkcd type year roa lev
# 1: 1 1 1 2 6
# 2: 2 0 1 3 7
# 3: 1 1 2 4 8
# 4: 2 0 2 5 9
This is what the final data should look like.
# stkcd type year roa lev
# 1 1 1 2013 2 6
# 2 1 1 2014 4 NA
# 3 1 1 2016 NA 8
# 4 2 0 2013 3 7
# 5 2 0 2014 5 NA
# 6 2 0 2016 NA 9
Does anybody have any good ways for it?
Thanks.

We can do this easily with splitstackshape. Insert a delimiter between the non-numeric and numeric parts of the column names of interest, then use merged.stack to reshape the data into 'long' format and rename the '.time_1' column to 'year'.
library(splitstackshape)
names(market) <- sub("(\\d+)", "_\\1", names(market))
res <- merged.stack(market, var.stubs = c("roa", "lev"), sep="_")
setnames(res, ".time_1", "year")
res
# stkcd type year roa lev
#1: 1 1 2013 2 6
#2: 1 1 2014 4 NA
#3: 1 1 2016 NA 8
#4: 2 0 2013 3 7
#5: 2 0 2014 5 NA
#6: 2 0 2016 NA 9

1. Use reshape() from {stats}:
library(data.table)
market <- data.table(
stkcd=c(1,2),
type =c(1,0),
roa2013=c(2,3),
roa2014=c(4,5),
lev2013=c(6,7),
lev2016=c(8,9))
market[,`:=`(roa2016=NA,lev2014=NA)]
long <- reshape(market,
idvar = "stkcd",
varying = c("roa2013","lev2013",
"roa2014","lev2014",
"roa2016","lev2016"),
sep = "",
timevar = "year",
direction = "long")
setorder(long,stkcd,year)
long
# stkcd type year roa lev
# 1: 1 1 2013 2 6
# 2: 1 1 2014 4 NA
# 3: 1 1 2016 NA 8
# 4: 2 0 2013 3 7
# 5: 2 0 2014 5 NA
# 6: 2 0 2016 NA 9
2. Use str_extract() from {stringr}:
library(data.table)
library(stringr)
market <- data.table(
stkcd=c(1,2),
type =c(1,0),
roa2013=c(2,3),
roa2014=c(4,5),
lev2013=c(6,7),
lev2016=c(8,9))
market
long <- melt(market,
id.vars = c("stkcd","type"))
long[,`:=`(year=str_extract(variable,pattern = "[0-9]{4}"),
vars=str_extract(variable,pattern = "[a-zA-Z]{1,}"))][,variable:=NULL]
long <- dcast(long, stkcd + type + year ~ vars, value.var = "value")
long
# stkcd type year lev roa
# 1: 1 1 2013 6 2
# 2: 1 1 2014 NA 4
# 3: 1 1 2016 8 NA
# 4: 2 0 2013 7 3
# 5: 2 0 2014 NA 5
# 6: 2 0 2016 9 NA
...

Related

Remove NAs in each column by group

I have a dataframe with rows grouped by Year. Variables don't always have observations in every year, but when they do, there are 3 observations in that year, and they may appear in different rows.
> na_data
Year Peter Paul John
1 2011 1 NA NA
2 2011 2 NA NA
3 2011 3 NA NA
4 2011 NA 1 NA
5 2011 NA 2 NA
6 2011 NA 3 NA
7 2012 1 NA NA
8 2012 NA 3 NA
9 2012 2 NA NA
10 2012 NA 2 NA
11 2012 3 NA NA
12 2012 NA 1 NA
13 2013 NA 1 4
14 2013 NA 2 5
15 2013 NA 3 6
16 2013 1 NA NA
17 2013 2 NA NA
18 2013 3 NA NA
I want to remove the NAs in each column by group. Such that the output looks like this:
final_data
Year Peter Paul John
[1,] 2011 1 1 NA
[2,] 2011 2 2 NA
[3,] 2011 3 3 NA
[4,] 2012 1 3 NA
[5,] 2012 2 2 NA
[6,] 2012 3 1 NA
[7,] 2013 1 1 4
[8,] 2013 2 2 5
[9,] 2013 3 3 6
So far I have used a loop but I am looking for a cleaner solution if anyone can help that would be great. My solution:
# Collapse each year's scattered observations into a dense block of rows.
# One matrix is built per year and the matrices are stacked at the end.
years <- unique(na_data$Year)
cleaned_list <- vector("list", length(years))
names(cleaned_list) <- years
for (yr in years) {
  # Three rows per year, as stated in the question; a column with no
  # observations in this year is left as NA.
  temp <- matrix(NA, nrow = 3, ncol = ncol(na_data),
                 dimnames = list(NULL, colnames(na_data)))
  for (name in colnames(na_data)[-1]) {
    # Filter rows through na_data$Year: a bare `Year` is not in scope when
    # subsetting a data.frame.
    no_nas <- as.vector(na.omit(na_data[na_data$Year == yr, name]))
    if (length(no_nas) != 0) temp[, name] <- no_nas
  }
  temp[, 1] <- yr
  cleaned_list[[as.character(yr)]] <- temp
}
final_data <- do.call("rbind", cleaned_list)
Data:
na_data <- data.frame(
Year = rep(c(2011,2012,2013), each = 6),
Peter = c(1:3, rep(NA, 3), 1,NA,2,NA,3,NA, rep(NA, 3),1:3),
Paul = c(rep(NA,3), 1:3, NA,3,NA,2,NA, 1, 1:3, rep(NA,3)),
John = c(rep(NA, 12), 4:6, rep(NA, 3))
)
desired <- data.frame(
Year = rep(c(2011,2012,2013), each = 3),
Peter = c(1:3, 1:3, 1:3),
Paul = c( 1:3, 3:1, 1:3),
John = c(rep(NA, 6), 4:6)
) # same as final_data but a dataframe
Here is one possible solution using data.table package:
library(data.table)
setDT(na_data)[, lapply(.SD, function(x) if(length(y<-na.omit(x))) y else first(x)), by=Year]
# Year Peter Paul John
# 1: 2011 1 1 NA
# 2: 2011 2 2 NA
# 3: 2011 3 3 NA
# 4: 2012 1 3 NA
# 5: 2012 2 2 NA
# 6: 2012 3 1 NA
# 7: 2013 1 1 4
# 8: 2013 2 2 5
# 9: 2013 3 3 6
dplyr equivalent:
library(dplyr)
# Drop the NAs of every column within each Year. A column with no observations
# in a year contributes a single NA, which is recycled to the group's size.
# reframe() is used because each group yields several rows; doing that with
# summarise() is deprecated since dplyr 1.1.0, as is across() without columns.
na_data |>
  group_by(Year) |>
  reframe(
    across(everything(), \(x) if (all(is.na(x))) first(x) else x[!is.na(x)])
  )
# # A tibble: 9 x 4
# Year Peter Paul John
# <dbl> <dbl> <dbl> <int>
# 1 2011 1 1 NA
# 2 2011 2 2 NA
# 3 2011 3 3 NA
# 4 2012 1 3 NA
# 5 2012 2 2 NA
# 6 2012 3 1 NA
# 7 2013 1 1 4
# 8 2013 2 2 5
# 9 2013 3 3 6
Convert to long form, remove the NA's, add a sequence number n, convert back and remove n.
library(dplyr)
library(tidyr)
# Reshape to long form, drop the NAs, number the remaining observations within
# each (Year, name) group, then spread back to wide form and drop the counter.
na_data %>%
  pivot_longer(-Year) %>%
  drop_na() %>%
  group_by(Year, name) %>%
  mutate(n = row_number()) %>%  # same values as 1:n(), without the 1:0 pitfall
  ungroup() %>%
  pivot_wider(names_from = name, values_from = value) %>%
  select(-n)
giving:
# A tibble: 9 x 4
Year Peter Paul John
<dbl> <dbl> <dbl> <dbl>
1 2011 1 1 NA
2 2011 2 2 NA
3 2011 3 3 NA
4 2012 1 3 NA
5 2012 2 2 NA
6 2012 3 1 NA
7 2013 1 1 4
8 2013 2 2 5
9 2013 3 3 6

Tracking the first incidence of each episode

I am currently using R to process a data set that looks like the following:
age ep
1 0
2 0
3 1
4 1
5 1
6 1
7 0
8 0
9 1
10 1
11 0
I want to create a variable that will keep track of the first occurrence of ep=1 per series of ep=1. These series will have ep=0 prior to the first ep=1 and ep=0 following the last ep=1 of each series.
I would like the data set to look like this after processing:
age ep first
1 0 NA
2 0 NA
3 1 1
4 1 NA
5 1 NA
6 1 NA
7 0 NA
8 0 NA
9 1 1
10 1 NA
11 0 NA
I am working in data table as this data set is rather large, so I'd prefer to process the data using code for data tables, however if this isn't possible I can convert to a data frame and use other code. Any assistance would be greatly appreciated.
A fast data.table method ...
library(data.table)
dt <- fread("age ep
1 0
2 0
3 1
4 1
5 1
6 1
7 0
8 0
9 1
10 1
11 0")
# Flag the first row of every run of ep == 1. `fill = 0L` makes the row before
# the first observation count as ep == 0, so a run that starts on row 1 is
# flagged too; with the default NA fill the condition is NA there, and
# data.table treats NA in `i` as FALSE, silently skipping that row.
dt[!shift(ep, fill = 0L) & ep, first := 1]
# or more explicit:
dt[shift(ep, fill = 0L) != 1 & ep == 1, first := 1]
dt
# age ep first
# 1: 1 0 NA
# 2: 2 0 NA
# 3: 3 1 1
# 4: 4 1 NA
# 5: 5 1 NA
# 6: 6 1 NA
# 7: 7 0 NA
# 8: 8 0 NA
# 9: 9 1 1
# 10: 10 1 NA
# 11: 11 0 NA
Note: just for clarity, if your object is not already a data.table. You can coerce it to a data.table:
setDT(dt)
Another option using an update join
# Reading the expression from the inside out:
#   dt[, .I[1], by=rleid(ep)]$V1  - row number of the first row of each run of ep
#   dt[<those rows>][ep == 1]     - keep only the run starts where ep is 1
#   [dt, on=.(age), ep]           - look every row of dt up in that subset by age;
#                                   matches return ep (1), non-matches return NA
# NOTE(review): the join key assumes `age` is unique per row - confirm for real data.
dt[, first := dt[dt[, .I[1], by=rleid(ep)]$V1][ep == 1][dt, on=.(age), ep]]
dt
# age ep first
# 1: 1 0 NA
# 2: 2 0 NA
# 3: 3 1 1
# 4: 4 1 NA
# 5: 5 1 NA
# 6: 6 1 NA
# 7: 7 0 NA
# 8: 8 0 NA
# 9: 9 1 1
#10: 10 1 NA
#11: 11 0 NA
Using data provided by @Khaynes
An approach using fifelse
# `fill = 0L` treats the row before the first observation as ep == 0, so an
# episode that starts on row 1 is flagged with 1 instead of getting NA.
dt[, first := fifelse(
  ep == 1 & shift(ep, type = "lag", fill = 0L) == 0L,
  1L,
  NA_integer_
)]
dt
# age ep first
# 1: 1 0 NA
# 2: 2 0 NA
# 3: 3 1 1
# 4: 4 1 NA
# 5: 5 1 NA
# 6: 6 1 NA
# 7: 7 0 NA
# 8: 8 0 NA
# 9: 9 1 1
# 10: 10 1 NA
# 11: 11 0 NA
Another update join version, using mult="first" to only overwrite the first matching row in the group:
dt[, rid := rleid(ep)][dt[ep==1], on=.(rid), mult="first", first := 1]
dt
# age ep rid first
# 1: 1 0 1 NA
# 2: 2 0 1 NA
# 3: 3 1 2 1
# 4: 4 1 2 NA
# 5: 5 1 2 NA
# 6: 6 1 2 NA
# 7: 7 0 3 NA
# 8: 8 0 3 NA
# 9: 9 1 4 1
#10: 10 1 4 NA
#11: 11 0 5 NA

Subsetting panel observations

I have a data.table with firm information.
library(data.table)
DT <- fread("
iso Firm GDP year
A 1 1 1
A 2 1 1
A 3 1 1
A 4 1 1
A 5 3 2
A 6 3 2
A 7 3 2
A 8 3 2
B 9 2 1
B 10 2 1
B 11 2 1
B 12 2 1
B 13 4 1
B 14 4 1
B 15 4 1
B 16 4 1",
header = TRUE)
I want to calculate GDPgrowth (per country) from one year to the other and add it to the dataset ((N-O)/O). However, if I do:
DT <- DT[,GDPgrowth :=((GDP- shift(GDP))/shift(GDP)), by=iso]
the outcome will be zero because it subtracts the firm observations from each other.
How can I make sure it calculates for the whole group of firms belonging to the country together?
Desired output:
library(data.table)
DT <- fread("
iso Firm GDP GDPgrowth year
A 1 1 NA 1
A 2 1 NA 1
A 3 1 NA 1
A 4 1 NA 1
A 5 3 2 2
A 6 3 2 2
A 7 3 2 2
A 8 3 2 2
B 9 2 NA 1
B 10 2 NA 1
B 11 2 NA 1
B 12 2 NA 1
B 13 4 1 1
B 14 4 1 1
B 15 4 1 1
B 16 4 1 1",
header = TRUE)
Here is one way continuing from your current approach :
library(data.table)
# Row-to-row relative change in GDP within each country: rows whose GDP equals
# the previous row's GDP come out as 0, and the first row per country as NA.
DT[,GDPgrowth :=((GDP- shift(GDP))/shift(GDP)), by=iso]
# Blank out the zeros so only rows where GDP actually changed keep a value.
# NOTE(review): a genuine 0% growth between two years would be blanked too.
DT[GDPgrowth == 0, GDPgrowth := NA]
# Carry the last non-NA value forward within each (iso, year) group;
# na.rm = FALSE keeps leading NAs so the result has the same length as the group.
DT[, GDPgrowth:= zoo::na.locf(GDPgrowth, na.rm = FALSE), .(iso, year)]
DT
# iso Firm GDP year GDPgrowth
# 1: A 1 1 1 NA
# 2: A 2 1 1 NA
# 3: A 3 1 1 NA
# 4: A 4 1 1 NA
# 5: A 5 3 2 2
# 6: A 6 3 2 2
# 7: A 7 3 2 2
# 8: A 8 3 2 2
# 9: B 9 2 1 NA
#10: B 10 2 1 NA
#11: B 11 2 1 NA
#12: B 12 2 1 NA
#13: B 13 4 1 1
#14: B 14 4 1 1
#15: B 15 4 1 1
#16: B 16 4 1 1
Using dplyr and tidyr::fill it can be done as
library(dplyr)
DT %>%
group_by(iso) %>%
mutate(GDPgrowth = (GDP - lag(GDP))/lag(GDP),
GDPgrowth = replace(GDPgrowth, GDPgrowth == 0, NA)) %>%
group_by(iso, year) %>%
tidyr::fill(GDPgrowth)

When 0 in x is odd, how to assign id value between this zero and the next zero to the new variable ref

x<-c(0,0,1,1,0,1,1,1,0,1,1,0,1,1)
aaa<-data.frame(x)
aaa$id<-1:nrow(aaa)
When the running count of zeros in x is odd, how can I assign the id values from that zero up to (but not including) the next zero to a new variable ref?
The result should look like:
aaa$ref <- with(aaa, ifelse(cumsum(x == 0) %% 2, id, NA))
aaa
# x id ref
# 1 0 1 1
# 2 0 2 NA
# 3 1 3 NA
# 4 1 4 NA
# 5 0 5 5
# 6 1 6 6
# 7 1 7 7
# 8 1 8 8
# 9 0 9 NA
# 10 1 10 NA
# 11 1 11 NA
# 12 0 12 12
# 13 1 13 13
# 14 1 14 14
An option using data.table
library(data.table)
i1 <- setDT(aaa)[, grp := rleid(x)][, .I[seq_len(.N) == .N & x==0], grp]$V1
i2 <- unlist(lapply(split(i1, as.integer(gl(length(i1), 2,
length(i1)))), function(x) head(x[1]:x[2],-1)))
aaa[!i2, ref := id][, grp := NULL][]
# x id ref
# 1: 0 1 1
# 2: 0 2 NA
# 3: 1 3 NA
# 4: 1 4 NA
# 5: 0 5 5
# 6: 1 6 6
# 7: 1 7 7
# 8: 1 8 8
# 9: 0 9 NA
#10: 1 10 NA
#11: 1 11 NA
#12: 0 12 12
#13: 1 13 13
#14: 1 14 14

Calculate diff price in a unbalanced set

I have an unbalanced data frame with dates, localities and prices. I would like to calculate the price differences between the different localities for each date. Because my data is unbalanced, I am thinking of adding rows (localities) to balance it so that I can get all the price differences.
My data look like:
library(dplyr)
set.seed(123)
df= data.frame(date=(1:3),
locality= rbinom(21,3, 0.2),
price=rnorm(21, 50, 20))
df %>%
arrange(date, locality)
> date locality price
1 1 0 60.07625
2 1 0 35.32994
3 1 0 63.69872
4 1 1 54.76426
5 1 1 66.51080
6 1 1 28.28602
7 1 2 47.09213
8 2 0 26.68910
9 2 1 100.56673
10 2 1 48.88628
11 2 1 48.29153
12 2 2 29.02214
13 2 2 45.68269
14 2 2 43.59887
15 3 0 60.98193
16 3 0 75.89527
17 3 0 43.30174
18 3 0 71.41221
19 3 0 33.62969
20 3 1 34.31236
21 3 1 23.76955
To balance the data, I am thinking of something like:
> date locality price
1 1 0 60.07625
2 1 0 35.32994
3 1 0 63.69872
4 1 1 54.76426
5 1 1 66.51080
6 1 1 28.28602
7 1 2 47.09213
8 1 2 NA
9 1 2 NA
10 2 0 26.68910
10 2 0 NA
10 2 0 NA
11 2 1 100.56673
12 2 1 48.88628
13 2 1 48.29153
14 2 2 29.02214
15 2 2 45.68269
16 2 2 43.59887
etc...
Finally, to get the price difference between each pair of localities, I am thinking of:
> date diff(price, 0-1) diff(price, 0-2) diff(price, 1-2)
1 1 60.07625-54.76426 60.07625-47.09213 etc...
2 1 35.32994-66.51080 35.32994-NA
3 1 63.69872-28.28602 63.69872-NA
You don't need to balance your data. If you use dcast, it will add the NAs for you.
First transform the data to show individual columns for each locality
library(data.table)
library(tidyverse)
setDT(df)
df[, rid := rowid(date, locality)]
df2 <- dcast(df, rid + date ~ locality, value.var = 'price')
# rid date 0 1 2
# 1: 1 1 60.07625 54.76426 47.09213
# 2: 1 2 26.68910 100.56673 29.02214
# 3: 1 3 60.98193 34.31236 NA
# 4: 2 1 35.32994 66.51080 NA
# 5: 2 2 NA 48.88628 45.68269
# 6: 2 3 75.89527 23.76955 NA
# 7: 3 1 63.69872 28.28602 NA
# 8: 3 2 NA 48.29153 43.59887
# 9: 3 3 43.30174 NA NA
# 10: 4 3 71.41221 NA NA
# 11: 5 3 33.62969 NA NA
Then create a data frame to_diff of differences to calculate, and pmap over that to calculate the differences. Here c0_1 corresponds to what you call in your question diff(price, 0-1).
to_diff <- CJ(0:2, 0:2)[V1 < V2]
pmap(to_diff, ~ df2[[as.character(.x)]] - df2[[as.character(.y)]]) %>%
setNames(paste0('c', to_diff[[1]], '_', to_diff[[2]])) %>%
bind_cols(df2[, 1:2])
# A tibble: 11 x 5
# c0_1 c0_2 c1_2 rid date
# <dbl> <dbl> <dbl> <int> <int>
# 1 5.31 13.0 7.67 1 1
# 2 -73.9 -2.33 71.5 1 2
# 3 26.7 NA NA 1 3
# 4 -31.2 NA NA 2 1
# 5 NA NA 3.20 2 2
# 6 52.1 NA NA 2 3
# 7 35.4 NA NA 3 1
# 8 NA NA 4.69 3 2
# 9 NA NA NA 3 3
# 10 NA NA NA 4 3
# 11 NA NA NA 5 3

Resources