Subset a data set based on first non-NA values - r

I have a data frame like this:
Year S1 S2 S3
1699 1 NA NA
1700 5 23 5
1701 6 1 6
1702 7 13 9
I want to keep only those columns where the first non-NA year is equal or bigger than 1700. In this case, I want to keep columns S2 and S3 but not S1 (since its first non-NA year is 1699).
How can I do this?

You can use Filter :
result <- cbind(df1[1], Filter(function(x)
df1$Year[which.max(!is.na(x))] >= 1700, df1[-1]))
result
# Year S2 S3
#1 1699 NA NA
#2 1700 23 5
#3 1701 1 6
#4 1702 13 9

Using an sapply like this.
d[c(T, sapply(d[-1], function(x) d$Year[!is.na(x)][1]) >= 1700)]
# Year S2 S3
# 1 1699 NA NA
# 2 1700 23 5
# 3 1701 1 6
# 4 1702 13 9
Data
d <- read.table(header=TRUE, text="Year S1 S2 S3
1699 1 NA NA
1700 5 23 5
1701 6 1 6
1702 7 13 9")

Related

R spread dataframe [duplicate]

This question already has answers here:
Reshape multiple value columns to wide format
(5 answers)
Closed 2 years ago.
IN R language how to convert
data1 into data2
data1 = fread("
id year cost pf loss
A 2019-02 155 10 41
B 2019-03 165 14 22
B 2019-01 185 34 56
C 2019-02 350 50 0
A 2019-01 310 40 99")
data2 = fread("
id item 2019-01 2019-02 2019-03
A cost 30 155 NA
A pf 40 10 NA
A loss 99 41 NA
B cost 185 NA 160
B pf 34 NA 14
B loss 56 NA 22
C cost NA 350 NA
C pf NA 50 NA
C loss NA 0 NA")
I try to use spread、gather、dplyr、apply..... but .....
First get the data in long format and then get it back in wide.
library(tidyr)
data1 %>%
pivot_longer(cols = cost:loss) %>%
pivot_wider(names_from = year, values_from = value)
Note that gather and spread have been retired and replace by pivot_longer and pivot_wider.
Using data.table :
library(data.table)
dcast(melt(data1, c('id', 'year')), id+variable~year, value.var = 'value')
# id variable 2019-01 2019-02 2019-03
#1: A cost 310 155 NA
#2: A pf 40 10 NA
#3: A loss 99 41 NA
#4: B cost 185 NA 165
#5: B pf 34 NA 14
#6: B loss 56 NA 22
#7: C cost NA 350 NA
#8: C pf NA 50 NA
#9: C loss NA 0 NA

How to split a data set with duplicated informations based on date

I have this situation:
ID date Weight
1 2014-12-02 23
1 2014-10-02 25
2 2014-11-03 27
2 2014-09-03 45
3 2014-07-11 56
3 NA 34
4 2014-10-05 25
4 2014-08-09 14
5 NA NA
5 NA NA
And I would like split the dataset in this, like this:
1-
ID date Weight
1 2014-12-02 23
1 2014-10-02 25
2 2014-11-03 27
2 2014-09-03 45
4 2014-10-05 25
4 2014-08-09 14
2- Lowest Date
ID date Weight
3 2014-07-11 56
3 NA 34
5 NA NA
5 NA NA
I tried this for second dataset:
dt <- dt[order(dt$ID, dt$date), ]
dt.2=dt[duplicated(dt$ID), ]
but didn't work
Get the ID's for which date are NA and then subset based on that
NA_ids <- unique(df$ID[is.na(df$date)])
subset(df, !ID %in% NA_ids)
# ID date Weight
#1 1 2014-12-02 23
#2 1 2014-10-02 25
#3 2 2014-11-03 27
#4 2 2014-09-03 45
#7 4 2014-10-05 25
#8 4 2014-08-09 14
subset(df, ID %in% NA_ids)
# ID date Weight
#5 3 2014-07-11 56
#6 3 <NA> 34
#9 5 <NA> NA
#10 5 <NA> NA
Using dplyr, we can create a new column which has TRUE/FALSE for each ID based on presence of NA and then use group_split to split into list of two.
library(dplyr)
df %>%
group_by(ID) %>%
mutate(NA_ID = any(is.na(date))) %>%
ungroup %>%
group_split(NA_ID, keep = FALSE)
The above dplyr logic can also be implemented in base R by using ave and split
df$NA_ID <- with(df, ave(is.na(date), ID, FUN = any))
split(df[-4], df$NA_ID)

How can I drop observations within a group following the occurrence of NA?

I am trying to clean my data. One of the criteria is that I need an uninterrupted sequence of a variable "assets", but I have some NAs. However, I cannot simply delete the NA observations, but need to delete all subsequent observations following the NA event.
Here an example:
productreference<-c(1,1,1,1,2,2,2,3,3,3,3,4,4,4,5,5,5,5)
Year<-c(2000,2001,2002,2003,1999,2000,2001,2005,2006,2007,2008,1998,1999,2000,2000,2001,2002,2003)
assets<-c(2,3,NA,2,34,NA,45,1,23,34,56,56,67,23,23,NA,14,NA)
mydf<-data.frame(productreference,Year,assets)
mydf
# productreference Year assets
# 1 1 2000 2
# 2 1 2001 3
# 3 1 2002 NA
# 4 1 2003 2
# 5 2 1999 34
# 6 2 2000 NA
# 7 2 2001 45
# 8 3 2005 1
# 9 3 2006 23
# 10 3 2007 34
# 11 3 2008 56
# 12 4 1998 56
# 13 4 1999 67
# 14 4 2000 23
# 15 5 2000 23
# 16 5 2001 NA
# 17 5 2002 14
# 18 5 2003 NA
I have already seen that there is a way to carry out functions by group using plyr and I have also been able to create a column with 0-1, where 0 indicates that assets has a valid entry and 1 highlights missing values of NA.
mydf$missing<-ifelse(mydf$assets>=0,0,1)
mydf[c("missing")][is.na(mydf[c("missing")])] <- 1
I have a very large data set so cannot manually delete the rows and would greatly appreciate your help!
I believe this is what you want:
library(dplyr)
group_by(mydf, productreference) %>%
filter(cumsum(is.na(assets)) == 0)
# Source: local data frame [11 x 3]
# Groups: productreference [5]
#
# productreference Year assets
# (dbl) (dbl) (dbl)
# 1 1 2000 2
# 2 1 2001 3
# 3 2 1999 34
# 4 3 2005 1
# 5 3 2006 23
# 6 3 2007 34
# 7 3 2008 56
# 8 4 1998 56
# 9 4 1999 67
# 10 4 2000 23
# 11 5 2000 23
Here is the same approach using data.table:
library(data.table)
dt <- as.data.table(mydf)
dt[,nas:= cumsum(is.na(assets)),by="productreference"][nas==0]
# productreference Year assets nas
# 1: 1 2000 2 0
# 2: 1 2001 3 0
# 3: 2 1999 34 0
# 4: 3 2005 1 0
# 5: 3 2006 23 0
# 6: 3 2007 34 0
# 7: 3 2008 56 0
# 8: 4 1998 56 0
# 9: 4 1999 67 0
#10: 4 2000 23 0
#11: 5 2000 23 0
Here is a base R option
mydf[unsplit(lapply(split(mydf, mydf$productreference),
function(x) cumsum(is.na(x$assets))==0), mydf$productreference),]
# productreference Year assets
#1 1 2000 2
#2 1 2001 3
#5 2 1999 34
#8 3 2005 1
#9 3 2006 23
#10 3 2007 34
#11 3 2008 56
#12 4 1998 56
#13 4 1999 67
#14 4 2000 23
#15 5 2000 23
Or an option with data.table
library(data.table)
setDT(mydf)[, if(any(is.na(assets))) .SD[seq(which(is.na(assets))[1]-1)]
else .SD, by = productreference]
You can do it using base R and a for loop. This code is a bit longer than some of the code in the other answers. In the loop we subset mydf by productreference and for every subset we look for the first occurrence of assets==NA, and exclude that row and all following rows.
mydf2 <- NULL
for (i in 1:max(mydf$productreference)){
s1 <- mydf[mydf$productreference==i,]
s2 <- s1[1:ifelse(all(!is.na(s1$assets)), NROW(s1), min(which(is.na(s1$assets)==T))-1),]
mydf2 <- rbind(mydf2, s2)
mydf2 <- mydf2[!is.na(mydf2$assets),]
}
mydf2

R - Calculate Time Elapsed Since Last Event with Multiple Event Types

I have a dataframe that contains the dates of multiple types of events.
df <- data.frame(date=as.Date(c("06/07/2000","15/09/2000","15/10/2000"
,"03/01/2001","17/03/2001","23/04/2001",
"26/05/2001","01/06/2001",
"30/06/2001","02/07/2001","15/07/2001"
,"21/12/2001"), "%d/%m/%Y"),
event_type=c(0,4,1,2,4,1,0,2,3,3,4,3))
date event_type
---------------- ----------
1 2000-07-06 0
2 2000-09-15 4
3 2000-10-15 1
4 2001-01-03 2
5 2001-03-17 4
6 2001-04-23 1
7 2001-05-26 0
8 2001-06-01 2
9 2001-06-30 3
10 2001-07-02 3
11 2001-07-15 4
12 2001-12-21 3
I am trying to calculate the days between each event type so the output looks like the below:
date event_type days_since_last_event
---------------- ---------- ---------------------
1 2000-07-06 0 NA
2 2000-09-15 4 NA
3 2000-10-15 1 NA
4 2001-01-03 2 NA
5 2001-03-17 4 183
6 2001-04-23 1 190
7 2001-05-26 0 324
8 2001-06-01 2 149
9 2001-06-30 3 NA
10 2001-07-02 3 2
11 2001-07-15 4 120
12 2001-12-21 3 172
I have benefited from the answers from these two previous posts but have not been able to address my specific problem in R; multiple event types.
Calculate elapsed time since last event
Calculate days since last event in R
Below is as far as I have gotten. I have not been able to leverage the last event index to calculate the last event date.
df <- cbind(df, as.vector(data.frame(count=ave(df$event_type==df$event_type,
df$event_type, FUN=cumsum))))
df <- rename(df, c("count" = "last_event_index"))
date event_type last_event_index
--------------- ------------- ----------------
1 2000-07-06 0 1
2 2000-09-15 4 1
3 2000-10-15 1 1
4 2001-01-03 2 1
5 2001-03-17 4 2
6 2001-04-23 1 2
7 2001-05-26 0 2
8 2001-06-01 2 2
9 2001-06-30 3 1
10 2001-07-02 3 2
11 2001-07-15 4 3
12 2001-12-21 3 3
We can use diff to get the difference between adjacent 'date' after grouping by 'event_type'. Here, I am using data.table approach by converting the 'data.frame' to 'data.table' (setDT(df)), grouped by 'event_type', we get the diff of 'date'.
library(data.table)
setDT(df)[,days_since_last_event :=c(NA,diff(date)) , by = event_type]
df
# date event_type days_since_last_event
# 1: 2000-07-06 0 NA
# 2: 2000-09-15 4 NA
# 3: 2000-10-15 1 NA
# 4: 2001-01-03 2 NA
# 5: 2001-03-17 4 183
# 6: 2001-04-23 1 190
# 7: 2001-05-26 0 324
# 8: 2001-06-01 2 149
# 9: 2001-06-30 3 NA
#10: 2001-07-02 3 2
#11: 2001-07-15 4 120
#12: 2001-12-21 3 172
Or as #Frank mentioned in the comments, we can also use shift (from version v1.9.5+ onwards) to get the lag (by default, the type='lag') of 'date' and subtract from the 'date'.
setDT(df)[, days_since_last_event := as.numeric(date-shift(date,type="lag")),
by = event_type]
The base R version of this is to use split/lapply/rbind to generate the new column.
> do.call(rbind,
lapply(
split(df, df$event_type),
function(d) {
d$dsle <- c(NA, diff(d$date)); d
}
)
)
date event_type dsle
0.1 2000-07-06 0 NA
0.7 2001-05-26 0 324
1.3 2000-10-15 1 NA
1.6 2001-04-23 1 190
2.4 2001-01-03 2 NA
2.8 2001-06-01 2 149
3.9 2001-06-30 3 NA
3.10 2001-07-02 3 2
3.12 2001-12-21 3 172
4.2 2000-09-15 4 NA
4.5 2001-03-17 4 183
4.11 2001-07-15 4 120
Note that this returns the data in a different order than provided; you can re-sort by date or save the original indices if you want to preserve that order.
Above, #akrun has posted the data.tables approach, the parallel dplyr approach would be straightforward as well:
library(dplyr)
df %>% group_by(event_type) %>% mutate(days_since_last_event=date - lag(date, 1))
Source: local data frame [12 x 3]
Groups: event_type [5]
date event_type days_since_last_event
(date) (dbl) (dfft)
1 2000-07-06 0 NA days
2 2000-09-15 4 NA days
3 2000-10-15 1 NA days
4 2001-01-03 2 NA days
5 2001-03-17 4 183 days
6 2001-04-23 1 190 days
7 2001-05-26 0 324 days
8 2001-06-01 2 149 days
9 2001-06-30 3 NA days
10 2001-07-02 3 2 days
11 2001-07-15 4 120 days
12 2001-12-21 3 172 days

Removing rows of data frame if number of NA in a column is larger than 3

I have a data frame (panel data): Ctry column indicates the name of countries in my data frame. In any column (for example: Carx) if number of NAs is larger 3; I want to drop the related country in my data fame. For example,
Country A has 2 NA
Country B has 4 NA
Country C has 3 NA
I want to drop country B in my data frame. I have a data frame like this (This is for illustration, my data frame is actually very huge):
Ctry year Carx
A 2000 23
A 2001 18
A 2002 20
A 2003 NA
A 2004 24
A 2005 18
B 2000 NA
B 2001 NA
B 2002 NA
B 2003 NA
B 2004 18
B 2005 16
C 2000 NA
C 2001 NA
C 2002 24
C 2003 21
C 2004 NA
C 2005 24
I want to create a data frame like this:
Ctry year Carx
A 2000 23
A 2001 18
A 2002 20
A 2003 NA
A 2004 24
A 2005 18
C 2000 NA
C 2001 NA
C 2002 24
C 2003 21
C 2004 NA
C 2005 24
A fairly straightforward way in base R is to use sum(is.na(.)) along with ave, to do the counting, like this:
with(mydf, ave(Carx, Ctry, FUN = function(x) sum(is.na(x))))
# [1] 1 1 1 1 1 1 4 4 4 4 4 4 3 3 3 3 3 3
Once you have that, subsetting is easy:
mydf[with(mydf, ave(Carx, Ctry, FUN = function(x) sum(is.na(x)))) <= 3, ]
# Ctry year Carx
# 1 A 2000 23
# 2 A 2001 18
# 3 A 2002 20
# 4 A 2003 NA
# 5 A 2004 24
# 6 A 2005 18
# 13 C 2000 NA
# 14 C 2001 NA
# 15 C 2002 24
# 16 C 2003 21
# 17 C 2004 NA
# 18 C 2005 24
You can use by() function to group by Ctry and count NA's of each group :
DF <- read.csv(
text='Ctry,year,Carx
A,2000,23
A,2001,18
A,2002,20
A,2003,NA
A,2004,24
A,2005,18
B,2000,NA
B,2001,NA
B,2002,NA
B,2003,NA
B,2004,18
B,2005,16
C,2000,NA
C,2001,NA
C,2002,24
C,2003,21
C,2004,NA
C,2005,24',
stringsAsFactors=F)
res <- by(data=DF$Carx,INDICES=DF$Ctry,FUN=function(x)sum(is.na(x)))
validCtry <-names(res)[res <= 3]
DF[DF$Ctry %in% validCtry, ]
# Ctry year Carx
#1 A 2000 23
#2 A 2001 18
#3 A 2002 20
#4 A 2003 NA
#5 A 2004 24
#6 A 2005 18
#13 C 2000 NA
#14 C 2001 NA
#15 C 2002 24
#16 C 2003 21
#17 C 2004 NA
#18 C 2005 24
EDIT :
if you have more columns to check, you could adapt the previous code as follows:
res <- by(data=DF,INDICES=DF$Ctry,
FUN=function(x){
return(sum(is.na(x$Carx)) <= 3 &&
sum(is.na(x$Barx)) <= 3 &&
sum(is.na(x$Tarx)) <= 3)
})
validCtry <- names(res)[res]
DF[DF$Ctry %in% validCtry, ]
where, of course, you may change the condition in FUN according to your needs.
Since you mention that you data is "very huge" (whatever that means exactly), you could try a solution with dplyr and see if it's perhaps faster than the solutions in base R. If the other solutions are fast enough, just ignore this one.
require(dplyr)
newdf <- df %.% group_by(Ctry) %.% filter(sum(is.na(Carx)) <= 3)

Resources