R Subset matching contiguous blocks - r

I have a dataframe.
# Example data: the key column k holds two separate runs of "A"
# (rows 1-2 and rows 6-8) with a run of "B" in between.
dat <- data.frame(
  k = c("A", "A", "B", "B", "B", "A", "A", "A"),
  a = c(4, 2, 4, 7, 5, 8, 3, 2),
  b = c(2, 5, 3, 5, 8, 4, 5, 8),
  # Spell out FALSE: F is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE
)
k a b
1 A 4 2
2 A 2 5
3 B 4 3
4 B 7 5
5 B 5 8
6 A 8 4
7 A 3 5
8 A 2 8
I would like to subset contiguous blocks based on variable k. This would be a standard approach.
# Using rle() rather than levels: rle() reports one value per contiguous run,
# so a value that occurs in several runs appears several times in kval.
kval <- rle(dat$k)$values
# seq_along() stays empty for an empty kval, where 1:length(kval) gives c(1, 0).
for (i in seq_along(kval)) {
  # The filter compares against the run's value only, so it selects every row
  # holding that value, not just the rows that belong to the i-th run.
  subdf <- subset(dat, dat$k == kval[i])
  print(subdf)
  # do something with subdf
}
k a b
1 A 4 2
2 A 2 5
6 A 8 4
7 A 3 5
8 A 2 8
k a b
3 B 4 3
4 B 7 5
5 B 5 8
k a b
1 A 4 2
2 A 2 5
6 A 8 4
7 A 3 5
8 A 2 8
So the subsetting above obviously does not work the way I intended. Any elegant way to get these results?
k a b
1 A 4 2
2 A 2 5
k a b
1 B 4 3
2 B 7 5
3 B 5 8
k a b
1 A 8 4
2 A 3 5
3 A 2 8

We can use rleid from data.table to create a grouping variable
library(data.table)
setDT(dat)[, grp := rleid(k)]
dat
# k a b grp
#1: A 4 2 1
#2: A 2 5 1
#3: B 4 3 2
#4: B 7 5 2
#5: B 5 8 2
#6: A 8 4 3
#7: A 3 5 3
#8: A 2 8 3
We can group by 'grp' and do all the operations within the 'grp' using standard data.table methods.
Here is a base R option to create 'grp'
# A group starts at row 1 and at every row whose k differs from the previous
# row's k; the running total of those starts is the group id.
dat$grp <- cumsum(c(TRUE, tail(dat$k, -1) != head(dat$k, -1)))

Related

Creating two columns of cumulative sum based on the categories of one column

I like to create two columns with cumulative frequency of "A" and "B" in the assignment columns.
df = data.frame(id = 1:10, assignment= c("B","A","B","B","B","A","B","B","A","B"))
id assignment
1 1 B
2 2 A
3 3 B
4 4 B
5 5 B
6 6 A
7 7 B
8 8 B
9 9 A
10 10 B
The resulting table would have this format
id assignment A B
1 1 B 0 1
2 2 A 1 1
3 3 B 1 2
4 4 B 1 3
5 5 B 1 4
6 6 A 2 4
7 7 B 2 5
8 8 B 2 6
9 9 A 3 6
10 10 B 3 7
How can the code be generalized to more than 2 categories (say "A", "B", "C")?
Thanks
Use lapply over unique values in assignment to create new columns.
# Distinct categories as plain strings, so they can be used as column names
# even when `assignment` is stored as a factor.
vals <- sort(unique(as.character(df$assignment)))
# One running-count column per category: cumsum() over the TRUE/FALSE match.
df[vals] <- lapply(vals, function(x) cumsum(df$assignment == x))
df
# id assignment A B
#1 1 B 0 1
#2 2 A 1 1
#3 3 B 1 2
#4 4 B 1 3
#5 5 B 1 4
#6 6 A 2 4
#7 7 B 2 5
#8 8 B 2 6
#9 9 A 3 6
#10 10 B 3 7
We can use model.matrix with colCumsums
library(matrixStats)
cbind(df, colCumsums(model.matrix(~ assignment - 1, df[-1])))
A base R option
transform(
df,
A = cumsum(assignment == "A"),
B = cumsum(assignment == "B")
)
gives
id assignment A B
1 1 B 0 1
2 2 A 1 1
3 3 B 1 2
4 4 B 1 3
5 5 B 1 4
6 6 A 2 4
7 7 B 2 5
8 8 B 2 6
9 9 A 3 6
10 10 B 3 7

merge/join two long df in R

I have two dataframes a and b which I would like to combine
a <- data.frame(g=c("1","2","2","3","3","3","4","4","4","4"),h=c("1","1","2","1","2","3","1","2","3","4"))
b <- data.frame(g=c("1","2","3","3","3","4","4","4","4","4"),i=c("1","2","3","2","1","2","3","4","5","6"))
g represents a grouping variable and h and i the columns I want to merge/join
> a
g h
1 1 1
2 2 1
3 2 2
4 3 1
5 3 2
6 3 3
7 4 1
8 4 2
9 4 3
10 4 4
> b
g i
1 1 1
2 2 2
3 3 3
4 3 2
5 3 1
6 4 2
7 4 3
8 4 4
9 4 5
10 4 6
a and b should be merged within each level of the grouping variable g: identical values of h and i should be placed in the same row (independent of the order in which they appear in h/i), and non-identical values should each appear only once (not in all possible combinations).
a final df would look like:
g h i
1 1 1 1
2 2 1 <NA>
3 2 2 2
4 3 1 1
5 3 2 2
6 3 3 3
7 4 1 <NA>
8 4 2 2
9 4 3 3
10 4 4 4
11 4 <NA> 5
12 4 <NA> 6
I need that df to perform a correlation analysis.
Sounds like a merge on h==i, while retaining i, so create a new variable x to join on, and keep join results from both sides (all=TRUE). With a large hat-tip to #Moody_Mudskipper:
merge(transform(a,x=h), transform(b,x=i), all=TRUE)
# g x h i
#1 1 1 1 1
#2 2 1 1 <NA>
#3 2 2 2 2
#4 3 1 1 1
#5 3 2 2 2
#6 3 3 3 3
#7 4 1 1 <NA>
#8 4 2 2 2
#9 4 3 3 3
#10 4 4 4 4
#11 4 5 <NA> 5
#12 4 6 <NA> 6
We can also do this with dplyr
library(dplyr)
a %>%
  # Copy h into a helper key x so that rows can be matched on h == i.
  mutate(x = h) %>%
  # Name the join keys explicitly instead of relying on the implicit
  # natural join over all shared column names.
  full_join(mutate(b, x = i), by = c("g", "x")) %>%
  # The helper key is no longer needed once the rows are matched.
  select(-x)

r - dedupe the rows with value in dataframe

How to subset only the rows with values in a particular column among the duplicates based on another column.
Example:
df
A B C D
1 NA 8 7
1 5 8 9
2 6 5 8
2 NA 5 6
3 NA 8 5
So in the above dataset, the first 4 rows are duplicates based on columns A and C, so among them I want to keep only the rows which have a value in column B.
Desired output,
A B C D
1 5 8 9
2 6 5 8
3 NA 8 5
Thanks.
Using dplyr:
# Rebuild the example data from the question.
df <- read.table(text = "A B C D
1 NA 8 7
1 5 8 9
2 6 5 8
2 NA 5 6
3 NA 8 5", header = TRUE)
df %>%
  group_by(A, C) %>%
  # Keep a row when its (A, C) group has a single row, or when B is present.
  filter(n() == 1 | !is.na(B)) %>%
  # Drop the grouping so later operations are not silently applied per group.
  ungroup()
A B C D
<int> <int> <int> <int>
1 1 5 8 9
2 2 6 5 8
3 3 NA 8 5
Duplicates back or forwards and not missing on B; or not a duplicate:
# Flag every row whose (A, C) pair occurs more than once; scanning from both
# ends makes sure the first occurrence of a pair is flagged as well.
anydup <- duplicated(df[c("A", "C")], fromLast = TRUE) |
  duplicated(df[c("A", "C")])
# Keep non-duplicated rows as they are; of the duplicated rows keep only
# those where B is present.
df[!anydup | !is.na(df$B), ]
# A B C D
#2 1 5 8 9
#3 2 6 5 8
#5 3 NA 8 5
Or use ave to check the length per group as per #HubertL's dplyr answer:
df[!is.na(df$B) | ave(df$B, df[c("A","C")], FUN=length)==1,]
# A B C D
#2 1 5 8 9
#3 2 6 5 8
#5 3 NA 8 5
Here is one option with data.table
library(data.table)
setDT(df)[df[, .I[.N==1 | complete.cases(B)] , .(A, C)]$V1]
# A B C D
#1: 1 5 8 9
#2: 2 6 5 8
#3: 3 NA 8 5

Create a new variable which count length of duplicate in R

I have a data frame and want to create a variable z that counts the duplicates of the variable y: if y has 1,1, set z = 2,2; if y has 3,3,3, set z = 3,3,3.
# Sample data: x is a label, y holds consecutive runs of repeated values.
x <- c("a", "b", "c", "d", "e", "a", "b", "c", "d", "e", "a", "b", "c")
y <- c(1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5)
data <- data.frame(x, y)
data
x y z
1 a 1 2
2 b 1 2
3 c 2 3
4 d 2 3
5 e 2 3
6 a 3 2
7 b 3 2
8 c 4 3
9 d 4 3
10 e 4 3
11 a 5 3
12 b 5 3
13 c 5 3
Thanks for your help.
You can try the rle:
# Compute the run-length encoding of y once.
runs <- rle(data$y)
# Repeat each run's length once per element of that run. rep() is vectorised
# over `times`, which avoids mapply() simplifying its result to a matrix when
# all runs happen to have the same length.
data$z <- rep(runs$lengths, times = runs$lengths)
data
x y z
1 a 1 2
2 b 1 2
3 c 2 3
4 d 2 3
5 e 2 3
6 a 3 2
7 b 3 2
8 c 4 3
9 d 4 3
10 e 4 3
11 a 5 3
12 b 5 3
13 c 5 3
If your variable y is sorted as an increasing sequence, as you say, then the following solution will work:
# calculate counts of each level
# calculate counts of each level
counts <- table(data$y)
# fill in z: look up each row's level in the table of counts, then strip the
# table attributes so z is stored as a plain vector column
data$z <- as.vector(counts[match(data$y, names(counts))])
Note, however, that this method will fail if y is not ordered, since you want to restart the count whenever a different level occurs. For that purpose, #psidom's solution is more robust to mis-ordered data, as rle will reset the count.
This method calculates the total occurrences of a level and then feeds these total counts to the proper location using match.
Here is a quick method using dplyr, and its rather intuitive syntax:
library(dplyr)
left_join(data, data %>%
group_by(y) %>%
summarize(z = n()),
by = "y")
x y z
1 a 1 2
2 b 1 2
3 c 2 3
4 d 2 3
5 e 2 3
6 a 3 2
7 b 3 2
8 c 4 3
9 d 4 3
10 e 4 3
11 a 5 3
12 b 5 3
13 c 5 3
We can do this easily with data.table
library(data.table)
setDT(data)[, z := .N , rleid(y)]
data
# x y z
# 1: a 1 2
# 2: b 1 2
# 3: c 2 3
# 4: d 2 3
# 5: e 2 3
# 6: a 3 2
# 7: b 3 2
# 8: c 4 3
# 9: d 4 3
#10: e 4 3
#11: a 5 3
#12: b 5 3
#13: c 5 3
Or using rle from base R without any loops
inverse.rle(within.list(rle(data$y), values <- lengths))
#[1] 2 2 3 3 3 2 2 3 3 3 3 3 3
Or another base R method with ave
with(data, ave(y, cumsum(c(TRUE, y[-1]!= y[-length(y)])), FUN=length))
#[1] 2 2 3 3 3 2 2 3 3 3 3 3 3

subset function in R with more than one conditions [duplicate]

I have this data.frame:
# Five groups labelled "1" to "5" with three rows each; b runs from 1 to 15.
a <- rep(c("1", "2", "3", "4", "5"), each = 3)
b <- as.numeric(1:15)
df <- data.frame(a, b)
a b
1 1 1
2 1 2
3 1 3
4 2 4
5 2 5
6 2 6
7 3 7
8 3 8
9 3 9
10 4 10
11 4 11
12 4 12
13 5 13
14 5 14
15 5 15
I want to have something like this:
# Desired result: only the rows of groups "2" and "3".
a <- rep(c("2", "3"), each = 3)
b <- as.numeric(4:9)
dffinal <- data.frame(a, b)
a b
1 2 4
2 2 5
3 2 6
4 3 7
5 3 8
6 3 9
I could use the "subset" function, but it's not working
sub <- subset(df,c(2,3) == a )
a b
5 2 5
8 3 8
This command only takes one row of "2" and "3" in column "a".
Any Help?
You're confusing == with %in%:
subset(df, a %in% c(2,3))
# a b
# 4 2 4
# 5 2 5
# 6 2 6
# 7 3 7
# 8 3 8
# 9 3 9
what about this?
library(dplyr)
df %>% filter(a == 2 | a==3)
a b
1 2 4
2 2 5
3 2 6
4 3 7
5 3 8
6 3 9
We can use data.table. We convert the 'data.frame' to 'data.table' (setDT(df)), and set the 'key' as column 'a', then we subset the rows.
library(data.table)
setDT(df, key= 'a')[c('2','3')]
# a b
#1: 2 4
#2: 2 5
#3: 2 6
#4: 3 7
#5: 3 8
#6: 3 9

Resources