How to assign a sequence number to a column with duplicated rows in R

I have a dataset where I want to assign a sequence number to a column with duplicated rows, for example:
variable_1
x
x
y
y
x
x
x
z
z
z
How do I get a result like this:
variable_1 sequence
x 1
y 2
x 3
z 4
I tried using unique, but then I would miss the sequence number for x when it shows up the second time.

A solution using dplyr and data.table.
library(dplyr)
library(data.table)
df2 <- df %>%
mutate(sequence = rleid(variable_1)) %>%
distinct()
df2
# variable_1 sequence
# 1 x 1
# 2 y 2
# 3 x 3
# 4 z 4
DATA
df <- read.table(text = "
variable_1
x
x
y
y
x
x
x
z
z
z
", header = TRUE, stringsAsFactors = FALSE)

A dplyr solution:
library(dplyr)
df = read.table(text = "
variable_1
x
x
y
y
x
x
x
z
z
z
", header=T, stringsAsFactors=F)
df %>%
mutate(flag = if_else(variable_1 != lag(variable_1), 1, 0, missing = 1), # flag row when variable changes
sequence = cumsum(flag)) %>% # create a group using the flags
distinct(variable_1, sequence) # get unique values
# variable_1 sequence
# 1 x 1
# 2 y 2
# 3 x 3
# 4 z 4

In base R (base R has no element-wise lag() like dplyr's, so compare each value with the previous one directly):
v <- df$variable_1 != c(NA, head(df$variable_1, -1))  # TRUE where the value changes
v[is.na(v)] <- TRUE                                   # the first row always starts a run
df$sequence <- cumsum(v)
df[!duplicated(df), ]
variable_1 sequence
1 x 1
3 y 2
5 x 3
8 z 4
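A shorter base R sketch uses rle(), which collapses consecutive runs of identical values directly:
r <- rle(df$variable_1)   # run values and run lengths
data.frame(variable_1 = r$values, sequence = seq_along(r$values))
#   variable_1 sequence
# 1          x        1
# 2          y        2
# 3          x        3
# 4          z        4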

Related

Initialise a dataframe where a column references another column

I wonder if there is a way to do:
df <- data.frame(x = 1:3)
df$y = df$x + 5
yielding:
x y
1 1 6
2 2 7
3 3 8
in one line of code where the y column refers to the x column? For example:
data.frame(x = 1:3, y = self$x + 5) # doesn't work
(I won't accept answers that ignore the x column, for example data.frame(x = 1:3, y = 6:8) :-))
This is possible using tibble() from the tibble library. Credit to @DaveArmstrong from the comments.
library(tibble)
tibble(x = 1:3, y = x + 5)
# A tibble: 3 × 2
x y
<int> <dbl>
1 1 6
2 2 7
3 3 8
Here's a base R method that does not need an external package (e.g. tibble).
We can use outer to add 5 to each element of x, then cbind the result and set the names.
setNames(data.frame(cbind(1:3, outer(1:3, 5, `+`))), c("x", "y"))
# or to expand your code
setNames(cbind(data.frame(x = 1:3), outer(1:3, 5, `+`)), c("x", "y"))
x y
1 1 6
2 2 7
3 3 8
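Base R also has one-expression alternatives that let y refer to x without repeating the values, e.g. within() or transform(); a quick sketch:
within(data.frame(x = 1:3), y <- x + 5)
# or
transform(data.frame(x = 1:3), y = x + 5)
#   x y
# 1 1 6
# 2 2 7
# 3 3 8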

Simultaneous Count and Sort in R

I am trying to obtain counts of a certain categorical variable in 2 separate columns, with each column reflecting the presence or absence of an indicator variable. This is for a very large data frame. Here is an example data frame to further illustrate what I'm trying to do.
X <- (1:10)
Y <- c('a','b','a','c','b','b','a','a','c','c')
Z <- c(0,1,1,1,0,1,0,1,1,1)
test_df <- data.frame(X,Y,Z)
I would like to make a new DF grouped by 'a', 'b', and 'c' with 2 columns to the right: one with the count of that letter for Z == 1 and one with the count for Z == 0.
The dplyr way:
library(dplyr)
library(tidyr)
#Code
res <- test_df %>% group_by(Y,Z) %>% summarise(N=n()) %>%
pivot_wider(names_from = Z,values_from=N,
values_fill = 0)
Output:
# A tibble: 3 x 3
# Groups: Y [3]
Y `0` `1`
<chr> <int> <int>
1 a 2 2
2 b 1 2
3 c 0 3
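A slightly shorter sketch of the same pipeline replaces group_by() + summarise(n()) with count():
library(dplyr)
library(tidyr)
test_df %>%
count(Y, Z) %>%
pivot_wider(names_from = Z, values_from = n, values_fill = 0)
# A tibble: 3 x 3
# Y `0` `1`
# <chr> <int> <int>
#1 a 2 2
#2 b 1 2
#3 c 0 3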
We can use values_fn in pivot_wider to do this in a single step
library(dplyr)
library(tidyr)
test_df %>%
pivot_wider(names_from = Z, values_from = X,
values_fn = length, values_fill = 0)
# A tibble: 3 x 3
# Y `0` `1`
# <chr> <int> <int>
#1 a 2 2
#2 b 1 2
#3 c 0 3
A base R option using aggregate + reshape
replace(
u <- reshape(
aggregate(X ~ ., test_df, length),
idvar = "Y",
timevar = "Z",
direction = "wide"
),
is.na(u),
0
)
giving
Y X.0 X.1
1 a 2 2
2 b 1 2
5 c 0 3
One way with data.table (this attaches the counts to every row; take the unique Y/count combinations to collapse to one row per letter):
library(data.table)
setDT(test_df)
test_df[ , z1 := sum(Z==1), by=Y]
test_df[ , z0 := sum(Z==0), by=Y]
unique(test_df[ , .(Y, z0, z1)])
In base R you can use table:
table(test_df$Y, test_df$Z)
# 0 1
# a 2 2
# b 1 2
# c 0 3
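table() returns a table object; if a plain data frame like the ones above is needed, the result can be converted, for example:
as.data.frame.matrix(table(test_df$Y, test_df$Z))
#   0 1
# a 2 2
# b 1 2
# c 0 3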

Concatenate rows and columns

I have a data set like this
x y z
a 5 4
b 1 2
And I want to concatenate the row and column labels, keeping each value:
ay 5
az 4
by 1
bz 2
Thanks
You can use melt and paste, but you will need to make your rownames a variable, i.e.
df$new <- rownames(df)
m_df <- reshape2::melt(df)
rownames(m_df) <- paste0(m_df$new, m_df$variable)
m_df <- m_df[-c(1:2)]
m_df
# value
#ax 5
#bx 1
#ay 4
#by 2
#az 3
#bz 1
After your edit, you don't need to convert rownames to a variable, so just:
m1_df <- reshape2::melt(df)
m1_df$new <- paste0(m1_df$x, m1_df$variable)
m1_df
# x variable value new
#1 a y 5 ay
#2 b y 1 by
#3 a z 4 az
#4 b z 2 bz
You can then tidy your data frame to the required output.
With dplyr and tidyr:
library(dplyr)
library(tidyr)
df %>%
gather(var, val, -x) %>%
mutate(var=paste0(x, var)) %>%
select(var, val)%>%
arrange(var)
# var val
#1 ay 5
#2 az 4
#3 by 1
#4 bz 2
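gather() has since been superseded in tidyr; the same pipeline can be sketched with pivot_longer():
library(dplyr)
library(tidyr)
df %>%
pivot_longer(-x, names_to = "var", values_to = "val") %>%
mutate(var = paste0(x, var)) %>%
select(var, val) %>%
arrange(var)
# same output as above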
library(reshape2)
library(dplyr)
library(tibble)
library(stringr)
# Create dataframe
x <- data.frame(x = c(5, 1),
y = c(4, 2),
z = c(3, 1),
row.names = c('a', 'b'))
# Convert rowname to column and melt
x <- tibble::rownames_to_column(x, "rownames") %>%
melt('rownames')
# assign concat columns as rownames
row.names(x) <- str_c(x$rownames, x$variable)
# Select relevant columns only
x <- select(x, value)
# Remove names from dataframe
names(x) <- NULL
> x
ax 5
bx 1
ay 4
by 2
az 3
bz 1
Here is another option in base R
stack(setNames(as.list(unlist(df[-1])), outer(df$x, names(df)[-1], paste0)))[2:1]

Filter by group max combination of values in a given order

I would like to filter, by group, for the maximal combination of values based on a given order of columns.
A vector of column names should specify the order in which to look for maximal values.
For example :
x <- data.frame(id = c("a", "a", "b", "b"),
x = c(1, 1, 1, 2),
y = c(1, 2, 2, 1),
z = c(1, 1, 2, 1))
> x
id x y z
1 a 1 1 1
2 a 1 2 1
3 b 1 2 2
4 b 2 1 1
In this example I would like to group by id and set the 'priority' to x, y, z, meaning that I want to look at the maximal x value, then its associated maximal y value, and then the maximal z value for that maximal x, y couple.
I'm not aware of a vectorized function for this, so I recursively group and take the maximum of each successive column. The expected result is:
> x
id x y z
1 a 1 2 1
2 b 2 1 1
I can do it in base R with a loop:
group <- "id"
cols <- c("x", "y", "z")
for (i in seq_along(cols)) {
tmp <- aggregate(setNames(list(x[[cols[i]]]), cols[i]), by = as.list(x[group]), FUN = max)
x <- merge(x, tmp, by = c(group, cols[i]))
group <- c(group, cols[i])
}
x <- x[!duplicated(x), ]
> x
id x y z
1 a 1 2 1
2 b 2 1 1
I would like to apply this to a larger amount of data, so this code will struggle at some point. Do you have any ideas to improve it?
Thank you for any help!
We can try with dplyr
library(dplyr)
x %>%
group_by(id) %>%
arrange(desc(y),desc(z)) %>%
slice(which.max(x))
# id x y z
# <fctr> <dbl> <dbl> <dbl>
#1 a 1 2 1
#2 b 2 1 1
Here is a base R solution using the split-apply-combine methodology.
dfNew <- do.call(rbind, lapply(split(x, x$id),
function(x) x[with(x, order(x, y, z, decreasing=TRUE))[1],]))
which returns
dfNew
id x y z
a a 1 2 1
b b 2 1 1
split splits the data frame by id and returns a list. This list is fed to lapply, which applies an anonymous function returning the row with the maximum values according to order. Finally, the single-row data frames are combined with rbind and do.call.
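For larger data, here is a data.table sketch of the same idea, assuming the priority order can be expressed by sorting each column in decreasing order and keeping the first row per id:
library(data.table)
dt <- as.data.table(x)
dt[order(-x, -y, -z), .SD[1], keyby = id]
#    id x y z
# 1:  a 1 2 1
# 2:  b 2 1 1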

Bind data frames on longer identifiers R

I've got two data frames that share a set of unique identifiers but differ in the number of observations per identifier. I would like to create a data frame that, for each common identifier, takes the observations from whichever frame has more of them. For example:
f1 <- data.frame(x = c("a", "a", "b", "c", "c", "c"), y = c(1,1,2,3,3,3))
f2 <- data.frame(x = c("a","b", "b", "c", "c"), y = c(4,5,5,6,6))
I would like this to generate a merge based on whichever frame has more rows for each x, such that it produces:
x y
a 1
a 1
b 5
b 5
c 3
c 3
c 3
Any and all thoughts would be great.
Here's a solution using split
dd<-rbind(cbind(f1, s="f1"), cbind(f2, s="f2"))
keep<-unsplit(lapply(split(dd$s, dd$x), FUN=function(x) {
y<-table(x)
x == names(y[which.max(y)])
}), dd$x)
dd <- dd[keep,]
Normally I'd prefer to use the ave function here, but because I'm changing data types from a factor to a logical it wasn't as appropriate, so I basically copied the idea that ave uses and used split instead.
dplyr solution
library(dplyr)
First we combine the data with rbind() and introduce a new variable called ref to record where each observation came from:
both <- rbind( f1, f2 )
both$ref <- rep( c( "f1", "f2" ) , c( nrow(f1), nrow(f2) ) )
Then count the observations: make another new variable that contains how many observations there are for each ref and x combination:
both_with_counts <- both %>%
group_by( ref ,x ) %>%
mutate( counts = n() )
then filter for the largest count:
both_with_counts %>% group_by( x ) %>% filter( counts == max(counts) )
note: you could also select only the x and y cols with select(x,y)...
this gives:
## Source: local data frame [7 x 4]
## Groups: x
##
## x y ref counts
## 1 a 1 f1 2
## 2 a 1 f1 2
## 3 c 3 f1 3
## 4 c 3 f1 3
## 5 c 3 f1 3
## 6 b 5 f2 2
## 7 b 5 f2 2
Altogether now...
what_I_want <-
rbind(cbind(f1,ref = "f1"),cbind(f2,ref = "f2")) %>%
group_by(ref,x) %>%
mutate(counts = n()) %>%
group_by( x ) %>%
filter( counts==max(counts) ) %>%
select( x, y )
and thus:
> what_I_want
# Source: local data frame [7 x 2]
# Groups: x
#
# x y
# 1 a 1
# 2 a 1
# 3 c 3
# 4 c 3
# 5 c 3
# 6 b 5
# 7 b 5
Not an elegant answer, but it still gives the desired result. Hope this helps.
f1table <- data.frame(table(f1$x))
colnames(f1table) <- c("x","freq")
f1new <- merge(f1,f1table)
f2table <- data.frame(table(f2$x))
colnames(f2table) <- c("x","freq")
f2new <- merge(f2,f2table)
table <- rbind(f1table, f2table)
table <- table[with(table, order(x,-freq)), ]
table <- table[!duplicated(table$x), ]
data <-rbind(f1new, f2new)
merge(data, table, by=c("x","freq"))[,c(1,3)]
x y
1 a 1
2 a 1
3 b 5
4 b 5
5 c 3
6 c 3
7 c 3
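For reference, a more compact sketch of the dplyr approach above, using bind_rows() to stack the frames and add_count() to attach the group sizes:
library(dplyr)
bind_rows(list(f1 = f1, f2 = f2), .id = "ref") %>%
add_count(ref, x, name = "counts") %>%
group_by(x) %>%
filter(counts == max(counts)) %>%
ungroup() %>%
select(x, y)
# same 7 rows as above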
