Grouping with condition in RStudio - r

Good morning everyone, I have a csv file (df2.csv) with several variables, as illustrated below (just for example):
CLASSE Variables Terms Number
1 DAT_1 20160701q 5
1 DAT_1 20160802q 2
1 DAT_1 20160901q 1
1 DAT_2 20161001q 1
1 DAT_2 20161201q 2
1 DAT_2 20170301q 3
2 DAT_1 20161001q 1
2 DAT_1 20161201q 2
2 DAT_1 20170301q 1
I want for each class (1 or 2 in this case), for each distinct date variable, if the number of individuals is less than 3, to group individuals with the next date. If I have a period of more than 3 individuals, in this case, I want to have a date like '20160701q-20160901q' instead of 20160701q and 20160901q separately. In this case, we group two dates or more to get a period of more than 3 individuals, and if the next date of the class has less than 3 individuals, we will group this date with the period before also.
I started whit this code
for (n in df2$CLASSE){
for (k in df2$Variables){
for (i in 1:nrow(df2)){
if (df2$Number[i]<3){
rempl_date=paste(df2$Terms[i],df2$Terms[i+1], sep="-")
df2$Terms[i]<-rempl_date
next
}
}
}
}
But it doesn't work, I want to have this one after grouping:
CLASSE Variables Terms Number
1 DAT_1 20160701q 5
1 DAT_1 20160802q-20160901q 3
1 DAT_2 20161001q-20161201q 3
1 DAT_2 20170301q 3
2 DAT_1 20161001q-20170301q 4
I don't know what I must change else if you can help me, I hope I was clear. Thanks in advance

We can use MESS::cumsumbinning function here to create groups until a threshold is reached.
library(dplyr)
thresh <- 3
temp <- df %>%
group_by(CLASSE, Variables,
group = MESS::cumsumbinning(Number, thresh)) %>%
summarise(Terms = if(n() > 1)
paste(first(Terms), last(Terms), sep = "-") else Terms,
Number = sum(Number)) %>%
select(-group)
This returns :
temp
# A tibble: 6 x 4
# Groups: CLASSE, Variables [3]
# CLASSE Variables Terms Number
# <int> <chr> <chr> <int>
#1 1 DAT_1 20160701q 5
#2 1 DAT_1 20160802q-20160901q 3
#3 1 DAT_2 20161001q-20161201q 3
#4 1 DAT_2 20170301q 3
#5 2 DAT_1 20161001q-20161201q 3
#6 2 DAT_1 20170301q 1
To combine the last row, we can do :
n <- nrow(temp)
if(temp$Number[n] < 3) {
temp$Terms[n-1] <- sub("-.*", paste0('-', temp$Terms[n]), temp$Terms[n -1])
temp$Number[n-1] <- sum(temp$Number[n-1], temp$Number[n])
temp <- temp[-n,]
}
# CLASSE Variables Terms Number
# <int> <chr> <chr> <int>
#1 1 DAT_1 20160701q 5
#2 1 DAT_1 20160802q-20160901q 3
#3 1 DAT_2 20161001q-20161201q 3
#4 1 DAT_2 20170301q 3
#5 2 DAT_1 20161001q-20170301q 4

Here is a base R solution:
define custom function for grouping
f <- function(v, th = 3) {
k <- 1
r <- c()
repeat {
if (length(v)==0) break
ind<-seq(head(which(cumsum(v)>=th),1))
if (sum(v)<2*th) {
r <- c(r,rep(k,length(v)))
v <- c()
} else {
r <- c(r,rep(k,length(ind)))
v <- v[-ind]
}
k <- k+1
}
r
}
then use aggregate + ave
dfout <- subset(aggregate(Terms~.,
within(within(df,grp <- ave(Number,Classe, Variables, FUN = f)),
Number <- ave(Number,Classe,Variables,grp,FUN = sum)),
c),
select = -grp)
format the dfout to the desired style by using order
dfout <- dfout[order(dfout$Classe,dfout$Variables),]
Output
> dfout
Classe Variables Number Terms
3 1 DAT_1 5 20160701q
4 1 DAT_1 3 20160802q, 20160901q
1 1 DAT_2 3 20161001q, 20161201q
5 1 DAT_2 3 20170301q
2 2 DAT_1 4 20161001q, 20161201q, 20170301q
DATA
df <- structure(list(Classe = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L),
Variables = c("DAT_1", "DAT_1", "DAT_1", "DAT_2", "DAT_2",
"DAT_2", "DAT_1", "DAT_1", "DAT_1"), Terms = c("20160701q",
"20160802q", "20160901q", "20161001q", "20161201q", "20170301q",
"20161001q", "20161201q", "20170301q"), Number = c(5L, 2L,
1L, 1L, 2L, 3L, 1L, 2L, 1L)), class = "data.frame", row.names = c(NA,
-9L))
UPDATE
If you want to concatenate the contents in Terms, try the code below
dfout <- subset(aggregate(Terms~.,
within(within(df,grp <- ave(Number,Classe, Variables, FUN = f)),
Number <- ave(Number,Classe,Variables,grp,FUN = sum)),
FUN = function(v) ifelse(length(v)==1,v,paste0(c(v[1],v[length(v)]),collapse = "-"))),
select = -grp)
dfout <- dfout[order(dfout$Classe,dfout$Variables),]
such that
> dfout
Classe Variables Number Terms
3 1 DAT_1 5 20160701q
4 1 DAT_1 3 20160802q-20160901q
1 1 DAT_2 3 20161001q-20161201q
5 1 DAT_2 3 20170301q
2 2 DAT_1 4 20161001q-20170301q

This is a quite cumbersome solution I've created that does what you asked. I'm sure it can be optimized or that functions from other packages can be used.
Explanations are inserted into the code
# new dataframe
df_new <- data.frame(
CLASSE = numeric(nrow(df2)),
Variables = character(nrow(df2)),
Terms = character(nrow(df2)),
Number = numeric(nrow(df2)),
stringsAsFactors = FALSE
)
# temporary dataframe
temp_df <- data.frame(
CLASSE = numeric(0),
Variables = character(0),
Terms = character(0),
Number = numeric(0),
stringsAsFactors = FALSE
)
temp_sum <- 0
present_row_temp_df <- 1
for (i in 1:nrow(df2)){
# if the row doesn't have to be grouped, just paste it in the new dataframe
if (df2$Number[i] >= 3){
df_new[i,] <- df2[i,]
next
}
# if the row has to be grouped, add it to a temporary dataframe
if (df2$Number[i] < 3){
temp_df[present_row_temp_df,] <- df2[i,]
temp_sum <- temp_sum + df2$Number[i]
present_row_temp_df <- present_row_temp_df + 1
# if the rows in the temporary dataframe need to be grouped now
if(temp_sum >= 3){
Terms_new <- paste(temp_df$Terms[1], temp_df$Terms[nrow(temp_df)], sep = "-")
Number_new <- sum(temp_df$Number)
df_new[i, c(1:3)] <- c(df2$CLASSE[i], df2$Variables[i], Terms_new)
df_new[i, 4] <- Number_new
# re-initialize temporary variables
temp_df <- data.frame(
CLASSE = numeric(0),
Variables = character(0),
Terms = character(0),
Number = numeric(0),
stringsAsFactors = FALSE
)
temp_sum <- 0
present_row_temp_df <- 1
}
# for the case in which the last row is not united with the previous rows
if (i == nrow(df2) & df2$Number[i] < 3){
Terms_new <- paste(stringr::str_extract(df_new$Terms[i-1], "^[^-]*"), df2$Terms[i], sep = "-")
Number_new <- df_new$Number[i-1] + df2$Number[i]
df_new[i, c(1:3)] <- c(df_new$CLASSE[i-1], df_new$Variables[i-1], Terms_new)
df_new[i, 4] <- Number_new
df_new[i-1,] <- c("0", "0", "0", 0)
}
}
}
# filter only relevant rows
df_new <- df_new[df_new$Number != 0,]
Result:
df_new
#CLASSE Variables Terms Number
# 1 DAT_1 20160701q 5
# 1 DAT_1 20160802q-20160901q 3
# 1 DAT_2 20161001q-20161201q 3
# 1 DAT_2 20170301q 3
# 2 DAT_1 20161001q-20170301q 4

Related

replace characters in string based on positions from another variable R

I have the below dataframe xo. For each row, I want to find and replace the positions listed in positions_of_Ns_to_remove in sequence. The results new variable in the example should be sequence with all R's removed. I cannot search based on the character itself in this situation - it must be based on the position of the character.
p <- data.frame(locus = c("1","2","3"), positions_of_Ns_to_remove = c("12,17,43,100","30,60,61,62",NA))
x <- data.frame(locus = c("1","1","2","3"), sequence = c("xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR","xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR","xxxxxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxRRRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx","xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"))
xo <- merge(x, p, by = c("locus"), all.x = T)
> xo
locus sequence positions_of_Ns_to_remove
1 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR 12,17,43,100
2 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR 12,17,43,100
3 2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxRRRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 30,60,61,62
4 3 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx <NA>
This works if there is only 1 row in xo, but not when there are multiple rows. I would like to use tidyverse functions / piping and avoid for loops if possible.
xo %>% dplyr::mutate(new_sequence = paste(
replace( unlist(strsplit(sequence, "")), as.integer(unlist(strsplit(positions_of_Ns_to_remove,","))), "" ),
collapse = "")
)
What I want:
locus new_sequence positions_of_Ns_to_remove
1 1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 12,17,43,100
2 1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 12,17,43,100
3 2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 30,60,61,62
4 3 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx <NA>
You could build a custom function and apply it to your data:
library(stringr)
# cuts the n-th character out of the string
remove_pos <- function(string, n) {
n <- as.integer(n)
n <- n[order(n, decreasing = TRUE)]
len <- nchar(string)
output <- string
for (i in n) {
output <- paste0(
str_sub(output, start = 1L, end = i - 1L),
str_sub(output, start = i + 1, end = len)
)
}
return(output)
}
xo %>%
mutate(positions = str_split(positions_of_Ns_to_remove, ",")) %>%
group_by(locus, n=row_number()) %>%
mutate(
new_seq = ifelse(!is.na(positions_of_Ns_to_remove),
remove_pos(sequence, unlist(positions)),
sequence)
) %>%
select(-positions) %>%
ungroup()
which returns
# A tibble: 5 x 4
locus sequence positions_of_Ns_to~ new_seq
<chr> <chr> <chr> <chr>
1 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxx~ 12,17,43,100 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
2 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxx~ 12,17,43,100 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
3 2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxx~ 30,60,61,62 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
4 3 Rxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~ 1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
5 4 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~ NA xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~

Using rep inside sapply to strech a vector according to another vector

I want to generate a data.frame of edges. Problems arise when many edges end on one node. Edges are defined in vectors from and to.
# Data
vertices <- data.frame(id = 1:3, label = c("a", "b", "c"), stringsAsFactors = FALSE)
to <- c("a", "b", "c")
from1 <- c("c", "a", "b")
from2 <- c("c", "a", "a,b,c")
What I tried:
# Attempt 1
create_edges_1 <- function(from, to) {
to <- sapply(to, function(x){vertices$id[vertices$label == x]})
from <- sapply(from, function(x){vertices$id[vertices$label == x]})
data.frame(from = from, to = to, stringsAsFactors = FALSE)
}
This works for example create_edges_1(from1, to), the output is:
from to
c 3 1
a 1 2
b 2 3
However for example from2 this attempt fails.
So I tried the following:
# Attempt 2
create_edges_2 <- function(from, to) {
to <- sapply(unlist(sapply(strsplit(to, ","), function(x){vertices$id[vertices$label == x]})), function(x){rep(x, sapply(strsplit(from2, ","), length))})
from <- unlist(sapply(strsplit(from2, ","), function(x){vertices$id[vertices$label == x]}))
data.frame(from = from, to = to, stringsAsFactors = FALSE)
}
The idea was to "stretch" to for every node where more than one edge ends. However create_edges_2(from1, to) and create_edges_2(from2, to) both throw an error
Error in rep(x, sapply(strsplit(from2, ","), length)) :
invalid 'times' argument
What am I doing wrong in my sapply statements?
The expected output for create_edges_2(from2, to) is:
from to
3 1
1 2
1 3
2 3
3 3
You could use joins or match for this
f2 <- strsplit(from2, ',')
df <- data.frame(from = unlist(f2)
, to = rep(to, lengths(f2))
, stringsAsFactors = FALSE)
With match
library(tidyverse)
map_dfc(df, ~ with(vertices, id[match(.x, label)]))
# # A tibble: 5 x 2
# from to
# <int> <int>
# 1 3 1
# 2 1 2
# 3 1 3
# 4 2 3
# 5 3 3
With Joins
library(dplyr)
df %>%
inner_join(vertices, by = c(from = 'label')) %>%
inner_join(vertices, by = c(to = 'label')) %>%
select_at(vars(matches('.x|.y')))
# id.x id.y
# 1 3 1
# 2 1 2
# 3 1 3
# 4 2 3
# 5 3 3
Here is a way:
# Attempt 3
library(dplyr)
to <- sapply(to, function(x){vertices$id[vertices$label == x]})
from0 <- sapply(from2, function(x) strsplit(x, ",")) %>% unlist() %>% as.character()
lengths0 <- lapply(sapply(from2, function(x) strsplit(x, ",")), length) %>% unlist()
to0 <- c()
for( i in 1:length(lengths0)) to0 <- c(to0, rep(to[i], lengths0[i]))
from <- sapply(from0, function(x){vertices$id[vertices$label == x]})
edges <- data.frame(from = from, to = to0, stringsAsFactors = FALSE)
edges
Giving this result as requested:
from to
1 3 1
2 1 2
3 1 3
4 2 3
5 3 3
The idea is to split from with comma separators, and to store the size of each element in order to "stretch" every node. Here done with a for loop

Count strings with a certain condition

I have the following dataset
#mydata
Factors Transactions
a,c 2
b 0
c 0
d,a 0
a 1
a 0
b 1
I'd like to count those factors who had transactions.For example, we had two times "a" with transaction. I can write a code to give me my desirable outcome for each variable separately. The following is for "a".
nrow (subset (mydata,mydata$Transaction > 0 & length(mydata[grep("a", mydata$Factors),] )> 0))
But I have too much variables and do not want to repeat a code for all of them. I would think there should be a way to write a code to give me the results for all of the variables. I wish to have the following out put:
#Output
a 2
b 1
c 1
d 0
Equivalent data.table option:
library(data.table)
setDT(df)[, .(Factors = unlist(strsplit(as.character(Factors), ","))),
by = Transactions][,.(Transactions = sum(Transactions > 0)), by = Factors]
# Factors Transactions
#1: a 2
#2: c 1
#3: b 1
#4: d 0
You could create a table using the unique values of the Factor column as the levels. Consider df to be your data set.
s <- strsplit(as.character(df$Factors), ",", fixed = TRUE)
table(factor(unlist(s[df$Transactions > 0]), levels = unique(unlist(s))))
#
# a c b d
# 2 1 1 0
Wrap in as.data.frame() for data frame output.
with(df, {
s <- strsplit(as.character(Factors), ",", fixed = TRUE)
f <- factor(unlist(s[Transactions > 0]), levels = unique(unlist(s)))
as.data.frame(table(Factors = f))
})
# Factors Freq
# 1 a 2
# 2 c 1
# 3 b 1
# 4 d 0
With tidyverse packages, assuming your data is strings/factors and numbers,
library(tidyr)
library(dplyr)
# separate factors with two elements
df %>% separate_rows(Factors) %>%
# set grouping for aggregation
group_by(Factors) %>%
# for each group, count how many transactions are greater than 0
summarise(Transactions = sum(Transactions > 0))
## # A tibble: 4 x 2
## Factors Transactions
## <chr> <int>
## 1 a 2
## 2 b 1
## 3 c 1
## 4 d 0
You could also avoid dplyr by using xtabs, though some cleaning is necessary to get to the same arrangement:
library(tidyr)
df %>% separate_rows(Factors) %>%
xtabs(Transactions > 0 ~ Factors, data = .) %>%
as.data.frame() %>%
setNames(names(df))
## Factors Transactions
## 1 a 2
## 2 b 1
## 3 c 1
## 4 d 0
A full base R equivalent:
df2 <- do.call(rbind,
Map(function(f, t){data.frame(Factors = strsplit(as.character(f), ',')[[1]],
Transactions = t)},
df$Factors, df$Transactions))
df3 <- as.data.frame(xtabs(Transactions > 0 ~ Factors, data = df2))
names(df3) <- names(df)
df3
## Factors Transactions
## 1 a 2
## 2 b 1
## 3 c 1
## 4 d 0
We can use cSplit from splitstackshape to split the 'Factors' into 'long' format and grouped by 'Factors' we get the sum of logical column ('Transactions > 0`).
library(splitstackshape)
cSplit(df1, "Factors", ",", "long")[, .(Transactions=sum(Transactions > 0)),.(Factors)]
# Factors Transactions
#1: a 2
#2: c 1
#3: b 1
#4: d 0
Or using base R
with(df1, table(factor(unlist(strsplit(Factors[Transactions>0], ",")),
levels = letters[1:4]) ))
# a b c d
# 2 1 1 0
data
df1 <- structure(list(Factors = c("a,c", "b", "c", "d,a", "a", "a",
"b"), Transactions = c(2L, 0L, 0L, 0L, 1L, 0L, 1L)), .Names = c("Factors",
"Transactions"), class = "data.frame", row.names = c(NA, -7L))

Rearranging longitudinal data

I have a dataset that is roughly structured like this:
case Year 2001 2002 2003 2004
1 2003 0 0 0 3
2 2002 0 5 3 2
3 2001 3 3 2 2
I am trying to restructure it so that every column represents the first, second (etc.) year counting from the "Year" variable, i.e.:
case Year yr1 yr2 yr3 yr4
1 2003 0 3 0 0
2 2002 5 3 2 0
3 2001 3 3 2 2
This code downloads the dataset and tries the solution suggested by #akrun, but it fails.
library("devtools")
df1 <- source_gist("b4c44aa67bfbcd6b72b9")
df1[-(1:2)] <- do.call(rbind,lapply(seq_len(nrow(df1)), function(i) {x <- df1[i, ]; x1 <- unlist(x[-(1:2)]); indx <- which(!is.na(x1))[1]; i <- as.numeric(names(indx))-x[,2]+1; x2 <- x1[!is.na(x1)]; x3 <- rep(NA, length(x1)); x3[i:(i+length(x2)-1)]<- x2; x3}))
This generates:
Error in i:(i + length(x2) - 1) : NA/NaN argument
In addition: Warning message:
In FUN(1:234[[1L]], ...) : NAs introduced by coercion
How can I transform the data so that every column represents the first, second (etc.) year counting from the value in the "Year" variable for each row?
Here's a possibilty:
library(dplyr)
library(reshape2)
df %>%
melt(id.vars = c("case", "Year")) %>%
mutate(variable = as.numeric(as.character(variable)),
yr = variable - Year + 1) %>%
filter(variable >= Year) %>%
dcast(case + Year ~ yr, fill = 0)
# case Year 1 2 3 4
# 1 1 2003 0 3 0 0
# 2 2 2002 5 3 2 0
# 3 3 2001 3 3 2 2
Data:
df <- structure(list(case = 1:3, Year = c(2003L, 2002L, 2001L), `2001` = c(0L,
0L, 3L), `2002` = c(0L, 5L, 3L), `2003` = c(0L, 3L, 2L), `2004` = c(3L,
2L, 2L)), .Names = c("case", "Year", "2001", "2002", "2003",
"2004"), class = "data.frame", row.names = c(NA, -3L))
This should create the manipulation you are looking for.
library("devtools")
df1 <- source_gist("b4c44aa67bfbcd6b72b9")
temp <- df1[[1]]
library(dplyr); library(tidyr); library(stringi)
temp <- temp %>%
gather(new.Years, X, -Year) %>% # convert rows to one column
mutate(Year.temp=paste0(rownames(temp), "-", Year)) %>% # concatenate the Year with row number to make them unique
mutate(new.Years = as.numeric(gsub("X", "", new.Years)), diff = new.Years-Year+1) %>% # calculate the difference to get the yr0 yr1 and so on
mutate(diff=paste0("yr", stri_sub(paste0("0", (ifelse(diff>0, diff, 0))), -2, -1))) %>% # convert the differences in Yr01 ...
select(-new.Years) %>% filter(diff != "yr00") %>% # drop new.Years column
spread(diff, X) %>% # convert column to rows
select(-Year.temp) # Drop Year.temp column
temp[is.na(temp)] <- 0 # replace NA with 0
temp %>% View
Notice that this will work for up to 99 years.
Here's a data.table solution:
require(data.table)
require(reshape2)
dt.m = melt(dt, id = 1:2, variable.factor = FALSE)
dt.m[, variable := as.integer(variable)-Year+1L]
dcast.data.table(dt.m, case + Year ~ variable, fill=0L,
value.var = "value", subset = (variable > 0L))
# case Year 1 2 3 4
# 1: 1 2003 0 3 0 0
# 2: 2 2002 5 3 2 0
# 3: 3 2001 3 3 2 2
library("devtools")
df1 <- source_gist("b4c44aa67bfbcd6b72b9")$value
I have an X in the colnames and remove it:
colnames(df1) <- gsub("X", "", colnames(df1))
I have got a solution without any additional packages:
startYear <- as.numeric(colnames(df1)[2])
shifts <- df1$Year - startYear
n <- ncol(df1)
df2 <- df1
colnames(df2)[-1] <- 1:(n-1)
df2[,2:n] <- NA
for(row in 1:nrow(df1)){
if(shifts[row]>=0){
df2[row,2:(n-shifts[row])] <- df1[row, (shifts[row]+2):n]
#df2[row,2:(n-shifts[row])] <- colnames(df1)[(shifts[row]+2):n]
}else{
df2[row, (-shifts[row]+2):n] <- df1[row, 2:(n+shifts[row])]
#df2[row, (-shifts[row]+2):n] <- colnames(df1)[2:(n+shifts[row])]
}
}
You can prefill df2 with 0 instead of NA of corse. Decomment second rows and comment first rows in the ifelse condition to validate the permutation.
Hope it does what you wanted.

Count Complete Cases per Group

I have a big data set (roughly 10 000 rows), and want to create a function that counts the number of complete cases (not NAs) per group. I tried various functions (aggregate, table, sum(complete.cases), group_by, etc), but somehow I miss one - probably little - trick. Thanks for any help!
A little sample data set to explain, the result I need.
x <- data.frame(group = c(1:4),
age = c(4:1, c(11, NA,13, NA)),
speed = c(12, NA,15,NA))
print(x)
# group age speed
#1 1 4 12
#2 2 3 NA
#3 3 2 15
#4 4 1 NA
#5 1 11 12
#6 2 NA NA
#7 3 13 15
#8 4 NA NA
One function I wrote reads as follows:
CountPerGroup <- function(group) {
data.set <- subset(x,group %in% group)
vect <- vector()
for (i in 1:length(group)) {
vect[i] <- sum(complete.cases(data.set))
}
output <- data.frame(cbind(group,count=vect))
return(output)
}
The result of
CountPerGroup(2:1)
is
group count
1 2 4
2 1 4
Unfortunately, this is wrong. Instead the outcome should look like
group count
1 2 1
2 1 4
What am I missing? How can I tell R to count of complete.cases per Group?
Thank you very much for any help on this!
Something like should do the trick if you wish to maintain your functionality:
x <- data.frame(group = c(1:4),
age = c(4:1, c(11, NA,13, NA)),
speed = c(12, NA,15,NA))
CountPerGroup <- function(x, groups) {
data.set <- subset(x, group %in% groups)
ans <- sapply(split(data.set, data.set$group),
function(y) sum(complete.cases(y)))
return(data.frame(group = names(ans), count = unname(ans)))
}
CountPerGroup(x, 1:2)
# group count
#1 1 2
#2 2 0
Which is correct from what I can count. But it does not agree with your suggested outcome.
EDIT
It seems that you want the number of non-NA instead and correctly sorted. Use this function instead:
CountPerGroup2 <- function(x, groups) {
data.set <- subset(x, group %in% groups)
ans <- sapply(split(data.set, data.set$group),
function(y) sum(!is.na(y[, !grepl("group", names(y))])))[groups]
return(data.frame(group = names(ans), count = unname(ans)))
}
CountPerGroup2(x, 2:1)
# group count
#1 2 1
#2 1 4
If you are just looking for a way to get the full count of non-NA values per group, you could use something like:
library(plyr)
x <- data.frame(group = c(1:4),
age = c(4:1, c(11, NA,13, NA)),
speed = c(12, NA,15,NA))
counts <- ddply(x, "group", summarize, count=sum(!is.na(c(age, speed))))
## group count
## 1 1 4
## 2 2 1
## 3 3 4
## 4 4 1
You do miss out on having a function that lets you query a subset of the groups, but you get a one-line way to calculate the full solution.
Here is a way with data.table
library(data.table)
library(functional)
countPerGroup = function(x, vec)
{
dt = data.table(x)
d1 = setkey(dt, group)[group %in% vec]
d2 = d1[,lapply(.SD, Compose(Negate(is.na), sum)),by=group]
transform(d2, count=age+speed, speed=NULL, age=NULL)
}
countPerGroup(x, 1:2)
# group count
#1: 1 4
#2: 2 1
countPerGroup(x, c(1,2))
# group count
#1: 1 4
#2: 2 1
If you have a high number of lines in your data.table, it is particularly efficient!
I just had the same problem and found an easier solution
library(data.table)
x <- data.table(group = c(1:4),
age = c(4:1, c(11, NA,13, NA)),
speed = c(12, NA,15,NA))
x[,sum(complete.cases(.SD)), by=group]

Resources