remove i+1th term if reoccuring - r

Say we have the following data
A <- c(1,2,2,2,3,4,8,6,6,1,2,3,4)
B <- c(1,2,3,4,5,1,2,3,4,5,1,2,3)
data <- data.frame(A,B)
How would one write a function so that for A, if we have the same value in the i+1th position, then the reoccuring row is removed.
Therefore the output should like like
data.frame(c(1,2,3,4,8,6,1,2,3,4), c(1,2,5,1,2,3,5,1,2,3))
My best guess would be using a for statement, however I have no experience in these

You can try
data[c(TRUE, data[-1,1]!= data[-nrow(data), 1]),]

Another option, dplyr-esque:
library(dplyr)
dat1 <- data.frame(A=c(1,2,2,2,3,4,8,6,6,1,2,3,4),
B=c(1,2,3,4,5,1,2,3,4,5,1,2,3))
dat1 %>% filter(A != lag(A, default=FALSE))
## A B
## 1 1 1
## 2 2 2
## 3 3 5
## 4 4 1
## 5 8 2
## 6 6 3
## 7 1 5
## 8 2 1
## 9 3 2
## 10 4 3

using diff, which calculates the pairwise differences with a lag of 1:
data[c( TRUE, diff(data[,1]) != 0), ]
output:
A B
1 1 1
2 2 2
5 3 5
6 4 1
7 8 2
8 6 3
10 1 5
11 2 1
12 3 2
13 4 3

Using rle
A <- c(1,2,2,2,3,4,8,6,6,1,2,3,4)
B <- c(1,2,3,4,5,1,2,3,4,5,1,2,3)
data <- data.frame(A,B)
X <- rle(data$A)
Y <- cumsum(c(1, X$lengths[-length(X$lengths)]))
View(data[Y, ])
row.names A B
1 1 1 1
2 2 2 2
3 5 3 5
4 6 4 1
5 7 8 2
6 8 6 3
7 10 1 5
8 11 2 1
9 12 3 2
10 13 4 3

Related

join columns recursively in R

Hello I have a data frame of 245 columns but to add some sets and generate new columns try to do it recursively as follows
cl1<-sample(1:4,10,replace=TRUE)
cl2<-sample(1:4,10,replace=TRUE)
cl3<-sample(1:4,10,replace=TRUE)
cl4<-sample(1:4,10,replace=TRUE)
cl5<-sample(1:4,10,replace=TRUE)
cl6<-sample(1:4,10,replace=TRUE)
dat<-data.frame(cl1,cl2,cl3,cl4,cl5,cl6)
my intention is to add column 1 with column 3 and 5, likewise column 2 with 4 and 6 and in the end obtain a dataframe with two columns
and you should pay me something like that
I have programmed the following code
revisar<- function(a){
todos = list()
i=1
j=3
l=5
k=1
while(i<=2 ){
cl<-a[,i]
cl2<-a[,j]
cl3<-a[,l]
cl[is.na(cl)] <- 0
cl2[is.na(cl2)] <- 0
cl3[is.na(cl3)] <- 0
colu<-cl+cl2+cl3
col<-cbind(colu,colu)
i<-i+1
j<-j+1
l<-l+1
k<-k+1
}
return(col)
}
it turns out that it only returns column 2 repeated twice and I must replicate the same thing to join those 245 columns.7
I would like to know what is failing the example
base R
Literal programming:
with(dat, data.frame(s1 = cl1+cl3+cl5, s2 = cl2+cl4+cl6))
# s1 s2
# 1 7 11
# 2 7 7
# 3 4 11
# 4 4 10
# 5 9 8
# 6 12 5
# 7 7 6
# 8 7 10
# 9 4 9
# 10 6 5
Programmatically,
L <- list(s1 = c(1,3,5), s2 = c(2,4,6))
out <- data.frame(lapply(L, function(z) do.call(rowSums, list(as.matrix(dat[,z])))))
out
# s1 s2
# 1 7 11
# 2 7 7
# 3 4 11
# 4 4 10
# 5 9 8
# 6 12 5
# 7 7 6
# 8 7 10
# 9 4 9
# 10 6 5
dplyr
library(dplyr)
dat %>%
transmute(
s1 = rowSums(cbind(cl1, cl3, cl5)),
s2 = rowSums(cbind(cl2, cl4, cl6))
)
or programmatically using purrr:
purrr::map_dfc(L, ~ rowSums(dat[, .]))
Data
set.seed(42)
# your `dat` above
Here is an alternative general approach:
Here we sum all uneven columns -> s1 and
all even columns -> s2:
library(dplyr)
dat %>%
rowwise() %>%
mutate(s1 = sum(c_across(seq(1,ncol(dat),2)), na.rm = TRUE),
s2 = sum(c_across(seq(2,ncol(dat),2)), na.rm = TRUE))
cl1 cl2 cl3 cl4 cl5 cl6 s1 s2
<int> <int> <int> <int> <int> <int> <int> <int>
1 1 1 3 2 3 2 7 5
2 2 4 1 4 2 3 5 11
3 2 2 2 2 1 3 5 7
4 2 4 4 3 1 4 7 11
5 2 4 4 3 2 2 8 9
6 3 3 3 2 2 2 8 7
7 2 1 1 2 1 4 4 7
8 2 4 1 3 2 3 5 10
9 3 1 1 2 3 4 7 7
10 2 4 1 3 4 4 7 11

Remove identical values if the same as previous in a time series

I have a time series:
df <- data.frame(t=1:10, x= c(5,7,8,9,5,5,5,5,4,3))
I want to remove values that are identical to the previous value to obtain:
x = c(5,7,8,9,5,4,3)
I tried:
df[unique(df$x),]
But this gives the incorrect answer.
You can do:
df[c(1, diff(df$x)) != 0, ]
t x
1 1 5
2 2 7
3 3 8
4 4 9
5 5 5
6 9 4
7 10 3
With dplyr, you can do:
df %>%
filter(x != lag(x, default = first(x)-1))
t x
1 1 5
2 2 7
3 3 8
4 4 9
5 5 5
6 9 4
7 10 3
In base R, we can use head and tail
subset(df, c(TRUE, head(x, -1) != tail(x, -1)))
# t x
#1 1 5
#2 2 7
#3 3 8
#4 4 9
#5 5 5
#9 9 4
#10 10 3
Another base solution would be using rle.
If you want to subset the dataframe based on the criteria, you can use lengths. Otherwise, if you only need the subset of x column, we should extract the values from rle. See below;
df[cumsum(rle(df$x)$lengths), ] # dataframe subset
# t x
# 1 1 5
# 2 2 7
# 3 3 8
# 4 4 9
# 8 8 5
# 9 9 4
# 10 10 3
rle(df$x)$values # vector of values
# [1] 5 7 8 9 5 4 3
Or using data.table:
library(data.table)
setDT(df_large)[, rn :=1:.N, by = rleid(x)][rn == 1, .(t, x)]
# t x
# 1: 1 5
# 2: 2 7
# 3: 3 8
# 4: 4 9
# 5: 5 5
# 6: 9 4
# 7: 10 3
library(dplyr)
df <- data.frame(t=1:10, x= c(5,7,8,9,5,5,5,5,4,3))
subsetVec <- df$x - lag(df$x) != 0
subsetVec <- replace_na(subsetVec, TRUE)
df[subsetVec,]

numbering duplicated rows in dplyr [duplicate]

This question already has answers here:
Using dplyr to get cumulative count by group
(3 answers)
Closed 5 years ago.
I come to an issue with numbering the duplicated rows in data.frame and could not find a similar post.
Let's say we have a data like this
df <- data.frame(gr=gl(7,2),x=c("a","a","b","b","c","c","a","a","c","c","d","d","a","a"))
> df
gr x
1 1 a
2 1 a
3 2 b
4 2 b
5 3 c
6 3 c
7 4 a
8 4 a
9 5 c
10 5 c
11 6 d
12 6 d
13 7 a
14 7 a
and want to add new column called x_dupl to show that first occurrence of x values is numbered as 1 and second time 2 and third time 3 and so on..
thanks in advance!
The expected output
> df
gr x x_dupl
1 1 a 1
2 1 a 1
3 2 b 1
4 2 b 1
5 3 c 1
6 3 c 1
7 4 a 2
8 4 a 2
9 5 c 2
10 5 c 2
11 6 d 1
12 6 d 1
13 7 a 3
14 7 a 3
Your example data (plus rows where gr = 7 as in your output), and named df1, not df:
df1 <- data.frame(gr = gl(7,2),
x = c("a","a","b","b","c","c","a","a","c","c","d","d","a","a"))
library(dplyr)
df1 %>%
group_by(x) %>%
mutate(x_dupl = dense_rank(gr)) %>%
ungroup()
# A tibble: 14 x 3
gr x x_dupl
<fctr> <fctr> <int>
1 1 a 1
2 1 a 1
3 2 b 1
4 2 b 1
5 3 c 1
6 3 c 1
7 4 a 2
8 4 a 2
9 5 c 2
10 5 c 2
11 6 d 1
12 6 d 1
13 7 a 3
14 7 a 3
A base R solution:
df <- data.frame(gr=gl(7,2),x=c("a","a","b","b","c","c","a","a","c","c","d","d","a","a"))
x <- rle(as.numeric(df$x))
x$values <- ave(x$values, x$values, FUN = seq_along)
df$x_dupl <- inverse.rle(x)
# gr x x_dupl
# 1 1 a 1
# 2 1 a 1
# 3 2 b 1
# 4 2 b 1
# 5 3 c 1
# 6 3 c 1
# 7 4 a 2
# 8 4 a 2
# 9 5 c 2
# 10 5 c 2
# 11 6 d 1
# 12 6 d 1
# 13 7 a 3
# 14 7 a 3

Creating a new column using for/nested loop in r

Just getting started using R and I need some help in understanding the application of for/nested loop.
StudyID<-c(1:5)
SubjectID<-c(1:5)
df<-data.frame(StudyID=rep(StudyID, each=5), SubjectID=rep(SubjectID, each=1))
How can I create a new column called as ID, which would use the combination of studyID and subjectID to create a unique ID ?
So for this data, unique ID should be from 1:25.
So the final data looks like this:
UniqueID<- c(1:25)
df<-cbind(df,UniqueID)
View(df)
Is there any other way which is faster and more efficient that looping ?
Using the dplyr package, you could do:
library(dplyr)
df$Id = group_indices(df,StudyID,SubjectID)
This returns:
#StudyID SubjectID Id
# 1 1 1
# 1 2 2
# 1 3 3
# 1 4 4
# 1 5 5
# 2 1 6
# 2 2 7
# 2 3 8
# 2 4 9
# 2 5 10
# 3 1 11
# 3 3 13
# 3 4 14
# 3 5 15
# 4 1 16
# 4 2 17
# 4 3 18
# 4 4 19
# 4 5 20
# 5 1 21
# 5 2 22
# 5 3 23
# 5 4 24
# 5 5 25
Another method to achieve that without loading any library (base R) would be this (assuming data frame is sorted based on the two columns):
StudyID<-c(1:5)
SubjectID<-c(1:5)
df<-data.frame(StudyID=rep(StudyID, each=5), SubjectID=rep(SubjectID, each=1))
df$uniqueID <- cumsum(!duplicated(df[1:2]))
or you can use this solution, mentioned in the comments (I prefer this over the first solution):
df$uniqueID <- as.numeric(factor(do.call(paste, df)))
The output would be:
> print(df, row.names = FALSE)
#StudyID SubjectID uniqueID
# 1 1 1
# 1 2 2
# 1 3 3
# 1 4 4
# 1 5 5
# 2 1 6
# 2 2 7
# 2 3 8
# 2 4 9
# 2 5 10
# 3 1 11
# 3 2 12
# 3 3 13
# 3 4 14
# 3 5 15
# 4 1 16
# 4 2 17
# 4 3 18
# 4 4 19
# 4 5 20
# 5 1 21
# 5 2 22
# 5 3 23
# 5 4 24
# 5 5 25
You could go for interaction in base R:
df$uniqueID <- with(df, as.integer(interaction(StudyID,SubjectID)))
For example (this example expresses better what you are after):
set.seed(10)
df <- data.frame(StudyID=sample(5,10,replace = T), SubjectID=rep(1:5,times=2))
df$uniqueID <- with(df, as.integer(interaction(StudyID,SubjectID)))
# StudyID SubjectID uniqueID
# 1 3 1 3
# 2 2 2 6
# 3 3 3 11
# 4 4 4 16
# 5 1 5 17
# 6 2 1 2
# 7 2 2 6
# 8 2 3 10
# 9 4 4 16
# 10 3 5 19

Assign value to group based on condition in column

I have a data frame that looks like the following:
> df = data.frame(group = c(1,1,1,2,2,2,3,3,3),
date = c(1,2,3,4,5,6,7,8,9),
value = c(3,4,3,4,5,6,6,4,9))
> df
group date value
1 1 1 3
2 1 2 4
3 1 3 3
4 2 4 4
5 2 5 5
6 2 6 6
7 3 7 6
8 3 8 4
9 3 9 9
I want to create a new column that contains the date value per group that is associated with the value "4" from the value column.
The following data frame shows what I hope to accomplish.
group date value newValue
1 1 1 3 2
2 1 2 4 2
3 1 3 3 2
4 2 4 4 4
5 2 5 5 4
6 2 6 6 4
7 3 7 6 8
8 3 8 4 8
9 3 9 9 8
As we can see, group 1 has the newValue "2" because that is the date associated with the value "4". Similarly, group two has newValue 4 and group three has newValue 8.
I assume there is an easy way to do this using ave() or a range of dplyr/data.table functions, but I have been unsuccessful with my many attempts.
Here's a quick data.table one
library(data.table)
setDT(df)[, newValue := date[value == 4L], by = group]
df
# group date value newValue
# 1: 1 1 3 2
# 2: 1 2 4 2
# 3: 1 3 3 2
# 4: 2 4 4 4
# 5: 2 5 5 4
# 6: 2 6 6 4
# 7: 3 7 6 8
# 8: 3 8 4 8
# 9: 3 9 9 8
Here's a similar dplyr version
library(dplyr)
df %>%
group_by(group) %>%
mutate(newValue = date[value == 4L])
Or a possible base R solution using merge after filtering the data (will need some renaming afterwards)
merge(df, df[df$value == 4, c("group", "date")], by = "group")
Here is a base R option
df$newValue = rep(df$date[which(df$value == 4)], table(df$group))
Another alternative using lapply
do.call(rbind, lapply(split(df, df$group),
function(x){x$newValue = rep(x$date[which(x$value == 4)],
each = length(x$group)); x}))
# group date value newValue
#1.1 1 1 3 2
#1.2 1 2 4 2
#1.3 1 3 3 2
#2.4 2 4 4 4
#2.5 2 5 5 4
#2.6 2 6 6 4
#3.7 3 7 6 8
#3.8 3 8 4 8
#3.9 3 9 9 8
One more base R path:
df$newValue <- ave(`names<-`(df$value==4,df$date), df$group, FUN=function(x) as.numeric(names(x)[x]))
df
group date value newValue
1 1 1 3 2
2 1 2 4 2
3 1 3 3 2
4 2 4 4 4
5 2 5 5 4
6 2 6 6 4
7 3 7 6 8
8 3 8 4 8
9 3 9 9 8
10 3 11 7 8
I used a test on variable length groups. I assigned the date column as the names for the logical index of value equal to 4. Then identify the value by group.
Data
df = data.frame(group = c(1,1,1,2,2,2,3,3,3,3),
date = c(1,2,3,4,5,6,7,8,9,11),
value = c(3,4,3,4,5,6,6,4,9,7))

Resources