Find corresponding "father" row in dataframe - r

I've a dataframe with A and B rows. Bs are children of A.
For each B line I need to write the corresponding A father.
The number of Bs for each A is variable.
I'm thinking about using a for loop, but I don't think that's the right way.
Here a simplified example:
> df <- data.frame(Index=c(1:9),
+ Type=c("A","B","A","B","B","B","A","B","B"),
+ Aindex="")
> df
Index Type Aindex
1 1 A
2 2 B
3 3 A
4 4 B
5 5 B
6 6 B
7 7 A
8 8 B
9 9 B
This is the result I'd like to have:
> df2
Index Type Aindex
1 1 A
2 2 B 1
3 3 A
4 4 B 3
5 5 B 3
6 6 B 3
7 7 A
8 8 B 7
9 9 B 7

In base R, cumsum() is really handy for this; I always use it to find parent-child relations.
# Example data: "A" rows are parents, the "B" rows below each one are its
# children.
df <- data.frame(
  Index = 1:9,
  Type = c("A", "B", "A", "B", "B", "B", "A", "B", "B")
)
# Flag the parent rows.
df$parent <- df$Type == "A"
# cumsum() over the flag numbers the parents 1, 2, 3, ... and gives every
# child the number of the closest parent above it (0 if there is none).
parent_no <- cumsum(df$parent)
# Look the parent's Index up while everything is still numeric, and only then
# write into the (character) result column. Parent rows stay "", and so do
# children that appear before the first parent.
has_parent <- !df$parent & parent_no > 0
df$aindex <- ""
df$aindex[has_parent] <- df$Index[df$parent][parent_no[has_parent]]
result
Index Type parent aindex
1 1 A TRUE
2 2 B FALSE 1
3 3 A TRUE
4 4 B FALSE 3
5 5 B FALSE 3
6 6 B FALSE 3
7 7 A TRUE
8 8 B FALSE 7
9 9 B FALSE 7

You can use tidyr::fill :
library(dplyr)
library(tidyr)
# Step 1: copy Index into Aindex, but leave NA on the 'B' rows.
with_gaps <- mutate(df, Aindex = replace(Index, Type == "B", NA))
# Step 2: fill every NA with the closest non-NA value above it, i.e. the
# Index of the preceding 'A' row.
filled <- fill(with_gaps, Aindex)
# Step 3: the 'A' rows themselves get an empty string.
mutate(filled, Aindex = replace(Aindex, Type == "A", ""))
# Index Type Aindex
#1 1 A
#2 2 B 1
#3 3 A
#4 4 B 3
#5 5 B 3
#6 6 B 3
#7 7 A
#8 8 B 7
#9 9 B 7

Related

Keep rows with specific string and the following row

This is my data frame
# 14 rows in four groups (sizes 3, 3, 4, 4); every row is type "A" except
# rows 2, 4, 8 and 12, which are type "B".
df <- data.frame(
  id = seq_len(14),
  group_id = rep(1:4, times = c(3, 3, 4, 4)),
  type = replace(rep("A", 14), c(2, 4, 8, 12), "B"),
  stringsAsFactors = FALSE
)
id group_id type
1 1 1 A
2 2 1 B
3 3 1 A
4 4 2 B
5 5 2 A
6 6 2 A
7 7 3 A
8 8 3 B
9 9 3 A
10 10 3 A
11 11 4 A
12 12 4 B
13 13 4 A
14 14 4 A
I'd like to keep all rows with type B as well as the following row.
I could do...
# Positions of the "B" rows and of the row that follows each of them.
B <- which(df$type == "B")
afterB <- B + 1
# Drop positions listed twice (two "B" rows in a row) and positions past the
# last row (the last row is a "B"), which would otherwise add an all-NA row.
keep <- unique(c(B, afterB))
keep <- keep[keep <= nrow(df)]
df_sel <- df[keep, ]
# Put the selected rows back into id order.
df_sel <- df_sel[order(df_sel$id), ]
df_sel
...to get what I want.
id group_id type
2 2 1 B
3 3 1 A
4 4 2 B
5 5 2 A
8 8 3 B
9 9 3 A
12 12 4 B
13 13 4 A
How can this be done in a more generic way.
Another way, very similar to what you do but in one step and without the need to reorder:
# For every "B" row take its position and the next one: rep(..., each = 2)
# doubles each position and the recycled c(0, 1) turns the copy into "+1".
keep <- rep(which(df$type == "B"), each = 2) + c(0, 1)
# Guard the edges: no position past the last row (last row is a "B") and no
# row selected twice (two "B" rows in a row). Order is already ascending.
keep <- unique(keep[keep <= nrow(df)])
df_sel <- df[keep, ]
df_sel
# id group_id type
# 2 2 1 B
# 3 3 1 A
# 4 4 2 B
# 5 5 2 A
# 8 8 3 B
# 9 9 3 A
# 12 12 4 B
# 13 13 4 A
Using lag from dplyr
library(dplyr)
# TRUE on the "B" rows. lag() shifts that flag down by one row, so the row
# right after each "B" is selected as well; default = FALSE covers row 1,
# which has no predecessor.
is_b <- df$type == "B"
df[is_b | lag(is_b, default = FALSE), ]
# id group_id type
#2 2 1 B
#3 3 1 A
#4 4 2 B
#5 5 2 A
#8 8 3 B
#9 9 3 A
#12 12 4 B
#13 13 4 A
Using grep gives the row indices of all rows whose type is B; concatenating these (with c()) with rows + 1 and using the sorted result to select from df will work.
# Anchor the pattern so that only values that are exactly "B" match, not
# every value that merely contains a "B".
rows <- grep("^B$", df[["type"]])
# Add the following rows, dropping duplicates (two "B" rows in a row) and any
# position past the last row (last row is a "B").
keep <- sort(unique(c(rows, rows + 1)))
df[keep[keep <= nrow(df)], ]
gives:
id group_id type
2 2 1 B
3 3 1 A
4 4 2 B
5 5 2 A
8 8 3 B
9 9 3 A
12 12 4 B
13 13 4 A

How to assign a value to a column based on a column index

Having a data frame I would like to assign a calculated value based on a given a column index
df <- data.frame(a = c(2,4,7,3,5,3), b = c(8,3,8,2,6,1))
> df
a b
1 2 8
2 4 3
3 7 8
4 3 2
5 5 6
6 3 1
max <- apply(df, 1, which.max)
> max
[1] 2 1 2 1 2 1
addition <- apply(df, 1, sum)
> addition
[1] 10 7 15 5 11 4
Then some operation which I cannot figure out with the following result being assigned to df2
> df2
a b
1 2 10
2 7 3
3 7 15
4 5 2
5 5 11
6 4 1
highly appreciate your ideas and your help. Thank you
You can use cbind to access your selected columns for each row:
# Work on a copy so that the original data frame is left untouched.
df2 <- df
# A two-column (row, column) matrix addresses exactly one cell per row: the
# column chosen by `max` is overwritten with that row's value of `addition`.
# seq_len() also behaves for a data frame with zero rows.
df2[cbind(seq_len(nrow(df2)), max)] <- addition
df2
a b
1 2 10
2 7 3
3 7 15
4 5 2
5 5 11
6 4 1
Here, cbind returns a matrix of 2 columns and 6 rows that we use to subset the dataframe using matrix subsetting.
You can also use vectorised ifelse directly:
# Put the row sum into whichever column holds the row maximum. `>=` sends
# ties to `a`, matching which.max(), which returns the first maximum.
with(df, cbind.data.frame(
  a = ifelse(a >= b, a + b, a),
  b = ifelse(a >= b, b, a + b)
))
# a b
#1 2 10
#2 7 3
#3 7 15
#4 5 2
#5 5 11
#6 4 1

Extract Index of repeat value

How do I extract specific rows of data when a column has repeating values? My data looks like this. I want to extract the row at the end of each run of X (A 3 10, A 2 3, etc.), or the index of that last value.
Name X M
A 1 1
A 2 9
A 3 10
A 1 1
A 2 3
A 1 5
A 2 6
A 3 4
A 4 5
A 5 3
B 1 1
B 2 9
B 3 10
B 1 1
B 2 3
Expected output
Index Name X M
3 A 3 10
5 A 2 3
10 A 5 3
13 B 3 10
15 B 2 3
Using base R duplicated and cumsum:
# Every X == 1 starts a new run, so cumsum() gives each run its own id.
run_id <- cumsum(dat$X == 1)
# Scanning from the bottom up, the first time an id is seen marks the last
# row of that run.
is_run_end <- !duplicated(run_id, fromLast = TRUE)
cbind(dat[is_run_end, ], Index = which(is_run_end))
# Name X M Index
#3 A 3 10 3
#5 A 2 3 5
#10 A 5 3 10
#13 B 3 10 13
#15 B 2 3 15
A solution using dplyr.
library(dplyr)
df2 <- df %>%
  # Flag is 1 when the next X is smaller than the current one (end of a run)
  # and NA on the last row, where lead() has nothing to look at.
  mutate(Flag = ifelse(lead(X) < X, 1, 0)) %>%
  # Record the original row position before any rows are filtered out.
  mutate(Index = row_number()) %>%
  filter(Flag == 1 | is.na(Flag)) %>%
  # Keep Name as well, as in the expected output.
  select(Index, Name, X, M)
df2
# Index Name X M
# 1 3 A 3 10
# 2 5 A 2 3
# 3 10 A 5 3
# 4 13 B 3 10
# 5 15 B 2 3
Flag is a column showing whether the next value of X is smaller than the current one. If TRUE, Flag is 1; otherwise it is 0. We can then filter for Flag == 1 or for rows where Flag is NA, which is the last row. df2 is the final filtered data frame.
DATA
df <- read.table(text = "Name X M
A 1 1
A 2 9
A 3 10
A 1 1
A 2 3
A 1 5
A 2 6
A 3 4
A 4 5
A 5 3
B 1 1
B 2 9
B 3 10
B 1 1
B 2 3",
header = TRUE, stringsAsFactors = FALSE)

Remove semi duplicate rows in R

I have the following data.frame.
a <- c(rep("A", 3), rep("B", 3), rep("C",2), "D")
b <- c(NA,1,2,4,1,NA,2,NA,NA)
c <- c(1,1,2,4,1,1,2,2,2)
d <- c(1,2,3,4,5,6,7,8,9)
df <-data.frame(a,b,c,d)
a b c d
1 A NA 1 1
2 A 1 1 2
3 A 2 2 3
4 B 4 4 4
5 B 1 1 5
6 B NA 1 6
7 C 2 2 7
8 C NA 2 8
9 D NA 2 9
I want to remove duplicate rows (based on columns a and c) so that the rows with a value in column b are kept. In this example, rows 1, 6, and 8 are removed.
One way to do this is to order by 'a', 'b' and the logical vector based on 'b' (is.na(b)), so that all NA elements come last within each group of 'a'. Then apply duplicated to columns 'a' and 'c' and keep only the non-duplicated rows.
# Sort by 'a' and then 'b'; NA values of 'b' end up last within each 'a'.
row_order <- order(df$a, df$b, is.na(df$b))
df1 <- df[row_order, ]
# Of the rows sharing the same ('a', 'c') pair, keep only the first one.
repeated <- duplicated(df1[c("a", "c")])
df2 <- df1[!repeated, ]
df2
# a b c d
#2 A 1 1 2
#3 A 2 2 3
#5 B 1 1 5
#4 B 4 4 4
#7 C 2 2 7
#9 D NA 2 9
setdiff(seq_len(nrow(df)), row.names(df2) )
#[1] 1 6 8
First create two datasets, one with the rows whose value in column a occurs more than once and one with the rows whose value in column a is unique, using the code below:
# Count how often each value of 'a' occurs.
a_counts <- table(df$a)
# x: rows whose 'a' value occurs more than once.
x <- df[df$a %in% names(a_counts[a_counts > 1]), ]
# x1: rows whose 'a' value occurs exactly once.
x1 <- df[df$a %in% names(a_counts[a_counts == 1]), ]
Now use the na.omit function on dataset x to delete the rows containing NA, and then rbind x and x1 into the final dataset.
rbind(na.omit(x),x1)
Answer:
a b c d
2 A 1 1 2
3 A 2 2 3
4 B 4 4 4
5 B 1 1 5
7 C 2 2 7
9 D NA 2 9
You can use dplyr to do this.
library(dplyr)
df %>%
  # distinct() keeps the first row of every (a, c) pair, so first move the
  # rows with a missing 'b' to the bottom of each 'a'; a row that has a value
  # in 'b' then wins whenever one exists.
  arrange(a, is.na(b)) %>%
  distinct(a, c, .keep_all = TRUE)
Output
a b c d
1 A 1 1 2
2 A 2 2 3
3 B 4 4 4
4 B 1 1 5
5 C 2 2 7
6 D NA 2 9
There are other options in dplyr, check this question for details: Remove duplicated rows using dplyr

R add columns indicating start and end for a sequence within columns

I have data as below.
names=c(rep("a",4),rep("b",5),rep("c",2))
time=c(1,2,3,4,1,2,3,4,5,1,2)
dd=data.frame(names,time)
dd <- group_by(dd, names)
dd <- mutate(dd, seq=seq_along(names))
extr <- summarise(dd, minw=min(time), maxw=max(time))
> dd
Source: local data frame [11 x 3]
Groups: names
names time seq
1 a 1 1
2 a 2 2
3 a 3 3
4 a 4 4
5 b 1 1
6 b 2 2
7 b 3 3
8 b 4 4
9 b 5 5
10 c 1 1
11 c 2 2
> extr
Source: local data frame [3 x 3]
names minw maxw
1 a 1 4
2 b 1 5
3 c 1 2
The final output that I need is shown below. I want to add two columns, first_indicator and last_indicator, which have the value "yes" if the combination of name and sequence holds the first or last value, respectively. How could I do this using the dd and extr data frames that I generate above?
names time seq first_indicator last_indicator
1 a 1 1 yes
2 a 2 2
3 a 3 3
4 a 4 4 yes
5 b 1 1 yes
6 b 2 2
7 b 3 3
8 b 4 4
9 b 5 5 yes
10 c 1 1 yes
11 c 2 2 yes
In base R, use ave:
# A row is the first/last of its group when its time equals the group's
# minimum/maximum. ave() returns that per-group value aligned to every row,
# so this also works when a group does not start at 1.
dd$first <- dd$time == ave(dd$time, dd$names, FUN = min)
dd$last <- dd$time == ave(dd$time, dd$names, FUN = max)
# names time first last
#1 a 1 TRUE FALSE
#2 a 2 FALSE FALSE
#3 a 3 FALSE FALSE
#4 a 4 FALSE TRUE
#5 b 1 TRUE FALSE
#6 b 2 FALSE FALSE
#7 b 3 FALSE FALSE
#8 b 4 FALSE FALSE
#9 b 5 FALSE TRUE
#10 c 1 TRUE FALSE
#11 c 2 FALSE TRUE
Using data.table you could do something like:
library(data.table)
# Compare against the group's own minimum/maximum instead of assuming that
# time runs 1, 2, ..., .N without gaps inside every group.
setDT(dd)[, c("first", "last") := list(time == min(time), time == max(time)),
          by = names]
You could do:
dd %>%
  group_by(names) %>%
  # Within each name, mark the rows whose time equals the group's first /
  # last time with "yes"; every other row gets an empty string.
  mutate(
    first = ifelse(time == first(time), "yes", ""),
    last = ifelse(time == last(time), "yes", "")
  )
Which gives:
#Source: local data frame [11 x 4]
#Groups: names
#
# names time first last
#1 a 1 yes
#2 a 2
#3 a 3
#4 a 4 yes
#5 b 1 yes
#6 b 2
#7 b 3
#8 b 4
#9 b 5 yes
#10 c 1 yes
#11 c 2 yes
Perhaps not the most elegant answer, but...
# Tag every summary row. After the merges below only the matching rows of dd
# carry "yes"; unmatched rows get NA in the indicator columns (all = TRUE).
extr$first_indicator <- rep("yes", nrow(extr))
extr$last_indicator <- rep("yes", nrow(extr))
# Columns are addressed by name rather than by position, so the merges keep
# working if dd or extr gain or reorder columns.
# First indicator: each group's minimum (minw) is matched against dd$time.
dd <- merge(
  dd, extr[c("names", "minw", "first_indicator")],
  by.x = c("names", "time"), by.y = c("names", "minw"), all = TRUE
)
# Last indicator: each group's maximum (maxw) is matched against dd$seq.
dd <- merge(
  dd, extr[c("names", "maxw", "last_indicator")],
  by.x = c("names", "seq"), by.y = c("names", "maxw"), all = TRUE
)
Should work. You could, of course, wrap it all in a function if you needed to do this multiple times.

Resources