dummy variable based on unique column interaction [duplicate]

This question already has answers here:
Pasting elements of two vectors alphabetically
(5 answers)
Closed 2 years ago.
I have the following data and wish to create an $ID variable for each unique interaction between two columns
DATE <- c('V', 'V', 'W', 'W', 'X', 'X', 'Y', 'Y', 'Z', 'Z')
SEX <- rep(1:2, 5)
Blood_T1 <- c(3,4,3,3,4,3,1,6,3,4)
Blood_T2 <- c(4,3,3,3,3,4,6,1,4,3)
df1 <- data.frame(DATE, SEX, Blood_T1, Blood_T2)
When grouping by $DATE, I want to create a new dummy variable for each unique combination of $Blood_T1 and $Blood_T2 regardless of their order.
The desired output appears below:
I can't use the sum, as it does not always produce unique combinations. (See the part marked in yellow above for clarification.)
I have tried the following commands, but none of them quite hits the nail on the head:
with(df1, interaction(Blood_T1, Blood_T2))
as.numeric(as.factor(with(df1, paste(Blood_T1, Blood_T2))))
transform(df1, Cluster_ID = as.numeric(interaction(Blood_T1, Blood_T2, drop=TRUE)))

You can actually sort the individual pairs ($Blood_T1 and $Blood_T2) and paste them together, which already gives a kind of ID:
apply(df1, 1, function(x) paste(sort(x[3:4]), collapse = ""))
#[1] "34" "34" "33" "33" "34" "34" "16" "16" "34" "34"
If you want to further reduce it, you can treat it as a factor and obtain the numeric value
as.numeric(as.factor(apply(df1, 1, function(x) paste(sort(x[3:4]), collapse = ""))))
#[1] 3 3 2 2 3 3 1 1 3 3
You could throw in DATE too, if that is necessary
apply(df1, 1, function(x) paste(sort(x[c(1,3:4)]), collapse = ""))
#[1] "34V" "34V" "33W" "33W" "34X" "34X" "16Y" "16Y" "34Z" "34Z"

We can try with data.table. Convert the 'data.frame' to a 'data.table' (setDT(df1)), take the pmin and pmax of the 'Blood_T1' and 'Blood_T2' columns, paste them together, and match the values against their unique elements to create 'Unique_ID'. Then we group by 'DATE' and concatenate the sums of 'Blood_T1' and 'Blood_T2' to create the 'Sum' column.
library(data.table)
setDT(df1)[, Unique_ID := {
  i1 <- paste(pmin(Blood_T1, Blood_T2), pmax(Blood_T1, Blood_T2))
  match(i1, unique(i1))
}]
df1[, Sum := c(sum(Blood_T1), sum(Blood_T2)), DATE][]
# DATE SEX Blood_T1 Blood_T2 Unique_ID Sum
#1: V 1 3 4 1 7
#2: V 2 4 3 1 7
#3: W 1 3 3 2 6
#4: W 2 3 3 2 6
#5: X 1 4 3 1 7
#6: X 2 3 4 1 7
#7: Y 1 1 6 3 7
#8: Y 2 6 1 3 7
#9: Z 1 3 4 1 7
#10: Z 2 4 3 1 7
The above can also be implemented in base R, i.e. with the same vectorized approach:
i1 <- with(df1, paste(pmin(Blood_T1, Blood_T2), pmax(Blood_T1, Blood_T2)))
df1$Unique_ID <- match(i1, unique(i1))
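This reproduces the same Unique_ID column as the data.table output above:
df1$Unique_ID
#[1] 1 1 2 2 1 1 3 3 1 1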

Related

Duplicating R dataframe vector values using another vector as a guide

I have the following R dataframe: df = data.frame(value=c(5,4,3,2,1), a=c(2,0,1,6,9), b=c(7,0,0,3,4)). I would like to duplicate the values of a and b by the number of times of the corresponding position values in value. For example, Expanding b would look like b_ex = c(7,7,7,7,7,2,2,2,4). No values of three or four would be in b_ex because values of zero are in b[2] and b[3]. The expanded vectors would be assigned names and be stand-alone.
Thanks!
Maybe you are looking for:
result <- lapply(df[-1], function(x) rep(x[x != 0], df$value[x != 0]))
#$a
#[1] 2 2 2 2 2 1 1 1 6 6 9
#$b
#[1] 7 7 7 7 7 3 3 4
To have them as separate vectors in the global environment, use list2env:
list2env(result, .GlobalEnv)

Transform table [duplicate]

This question already has answers here:
Repeat each row of data.frame the number of times specified in a column
(10 answers)
Closed 4 years ago.
I would like to repeat entire rows in a data-frame based on the samples column.
My input:
df <- 'chr start end samples
1 10 20 2
2 4 10 3'
df <- read.table(text=df, header=TRUE)
My expected output:
df <- 'chr start end samples
1 10 20 1-10-20-s1
1 10 20 1-10-20-s2
2 4 10 2-4-10-s1
2 4 10 2-4-10-s2
2 4 10 2-4-10-s3'
Any idea how to do this cleanly?
We can use expandRows to expand the rows based on the value in the 'samples' column, then convert to data.table and, grouped by 'chr', paste the columns together along with the sequence of rows using sprintf to update the 'samples' column.
library(splitstackshape)
setDT(expandRows(df, "samples"))[,
  samples := sprintf("%d-%d-%d-%s%d", chr, start, end, "s", 1:.N), chr][]
# chr start end samples
#1: 1 10 20 1-10-20-s1
#2: 1 10 20 1-10-20-s2
#3: 2 4 10 2-4-10-s1
#4: 2 4 10 2-4-10-s2
#5: 2 4 10 2-4-10-s3
NOTE: data.table will be loaded when we load splitstackshape.
You can achieve this using base R (i.e. avoiding data.tables), with the following code:
df <- 'chr start end samples
1 10 20 2
2 4 10 3'
df <- read.table(text = df, header = TRUE)
duplicate_rows <- function(chr, starts, ends, samples) {
  expanded_samples <- paste0(chr, "-", starts, "-", ends, "-", "s", 1:samples)
  repeated_rows <- data.frame("chr" = chr, "starts" = starts, "ends" = ends, "samples" = expanded_samples)
  repeated_rows
}
expanded_rows <- Map(f = duplicate_rows, df$chr, df$start, df$end, df$samples)
new_df <- do.call(rbind, expanded_rows)
The basic idea is to define a function that will take a single row from your initial data.frame and duplicate rows based on the value in the samples column (as well as creating the distinct character strings you're after). This function is then applied to each row of your initial data.frame. The output is a list of data.frames that then need to be re-combined into a single data.frame using the do.call pattern.
The above code can be made cleaner by using Hadley Wickham's purrr package (on CRAN) and its data.frame-specific version of map (see the documentation for the by_row function), but this may be overkill for what you're after.
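For reference, a minimal sketch in that spirit using purrr::pmap together with dplyr::bind_rows (an assumed alternative, not the by_row approach referred to above); the function body mirrors duplicate_rows:
# sketch only: iterate over the rows of df, building one small data.frame per row
library(purrr)
library(dplyr)
expanded_rows <- pmap(df, function(chr, start, end, samples) {
  data.frame(chr = chr, starts = start, ends = end,
             samples = paste0(chr, "-", start, "-", end, "-s", seq_len(samples)))
})
new_df <- bind_rows(expanded_rows)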
Example using the DataFrame function from the S4Vectors package:
df <- DataFrame(x=c('a', 'b', 'c', 'd', 'e'), y=1:5)
rep(df, df$y)
where the y column gives the number of times to repeat the corresponding row.
Result:
DataFrame with 15 rows and 2 columns
x y
<character> <integer>
1 a 1
2 b 2
3 b 2
4 c 3
5 c 3
... ... ...
11 e 5
12 e 5
13 e 5
14 e 5
15 e 5
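If a plain data.frame is needed afterwards, the result can be coerced back (a small usage note; this assumes the S4Vectors package, which provides an as.data.frame method for DataFrame):
as.data.frame(rep(df, df$y))  # same expansion, returned as a base data.frame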

What is the most effective way to sort dataframe and add special id? [duplicate]

I would like to create a numeric indicator for a matrix such that for each unique element in one variable, it creates a sequence of the length based on the element in another variable. For example:
frame<- data.frame(x = c("a", "a", "a", "b", "b"), y = c(3,3,3,2,2))
frame
x y
1 a 3
2 a 3
3 a 3
4 b 2
5 b 2
The indicator, z, should look like this:
x y z
1 a 3 1
2 a 3 2
3 a 3 3
4 b 2 1
5 b 2 2
Any and all help greatly appreciated. Thanks.
No ave?
frame$z <- with(frame, ave(y,x,FUN=seq_along) )
frame
# x y z
#1 a 3 1
#2 a 3 2
#3 a 3 3
#4 b 2 1
#5 b 2 2
A data.table version could be something like below (thanks to #mnel):
#library(data.table)
#frame <- as.data.table(frame)
frame[,z := seq_len(.N), by=x]
My original thought was to use:
frame[,z := .SD[,.I], by=x]
where .SD refers to each subset of the data.table split by x. .I returns the row numbers for an entire data.table. So, .SD[,.I] returns the row numbers within each group. Although, as #mnel points out, this is inefficient compared to the other method as the entire .SD needs to be loaded into memory for each group to run this calculation.
Another approach:
frame$z <- unlist(lapply(rle(as.numeric(frame[, "x"]))$lengths, seq_len))
Or with dplyr (using the pipe; the old %.% operator is defunct):
library(dplyr)
frame %>%
  group_by(x) %>%
  mutate(z = seq_along(y))
You can split the data.frame on x, and generate a new id column based on that:
> frame$z <- unlist(lapply(split(frame, frame$x), function(x) 1:nrow(x)))
> frame
x y z
1 a 3 1
2 a 3 2
3 a 3 3
4 b 2 1
5 b 2 2
Or even more simply using data.table:
library(data.table)
frame <- data.table(frame)[,z:=1:nrow(.SD),by=x]
Try this, where x is the column to group by and y is any numeric column. If there are no numeric columns, use seq_along(x), say, in place of y (see the variant sketched after the code):
transform(frame, z = ave(y, x, FUN = seq_along))
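For example, when no numeric column is available:
# seq_along(x) just supplies a vector of the right length for ave() to group
transform(frame, z = ave(seq_along(x), x, FUN = seq_along))
# gives the same z column as above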

How to group by column?

I have a dataframe of student scores. Instead of getting an overall average score for every student, I need to get the average score by "course type" for every student; for example, courses a, c, d are one type and courses b, e are another. I do this with the following code, but it is not "R" enough:
x <- data.frame(a=c(1,2,3), b=c(4,5,6), c=c(6,7,8),
d=c(7,8,9), e=c(10, 11, 12))
group <- data.frame(no=c(1,2,1,1,2), name=c("a", "b", "c", "d","e"))
> x
a b c d e
1 1 4 6 7 10
2 2 5 7 8 11
3 3 6 8 9 12
> group
no name
1 1 a
2 2 b
3 1 c
4 1 d
5 2 e
I think this is a bit clumsy:
x.1 <- x[,as.character(group$name[group$no==1])]
x.2 <- x[,as.character(group$name[group$no==2])]
mean.by.no <- data.frame(x.1.mean=apply(x.1, 1, mean),
x.2.mean=apply(x.2, 1, mean))
If mean.by.no is the expected result, we could split the 'name' column by 'no' (from the 'group' dataset) to get a list. Using one of the apply family of functions (lapply/sapply/vapply), we can use that output as a column index into 'x' and get the mean of each row (rowMeans).
vapply(with(group, split(as.character(name), no)),
function(y) rowMeans(x[y]), numeric(nrow(x)))
# 1 2
#[1,] 4.666667 7
#[2,] 5.666667 8
#[3,] 6.666667 9
Or, using tapply, we can get the means using a grouping index for the rows and columns.
indx <- xtabs(no~name, group)[col(x)]
t(tapply(as.matrix(x), list(indx, row(x)), FUN=mean))
# 1 2
#1 4.666667 7
#2 5.666667 8
#3 6.666667 9
Another option would be to convert 'x' from 'wide' to 'long' format using melt from data.table after converting the 'data.frame' to a 'data.table' (setDT). Set the key column to 'name' (setkey(...)), join with 'group', and get the mean grouped by 'no' and 'rn' (the row-number column created by keep.rownames=TRUE). If needed, the output can be converted back to 'wide' format using dcast.
library(data.table)#v1.9.5+
dL <- setkey(melt(setDT(x, keep.rownames = TRUE), id.var = 'rn',
                  variable.name = 'name')[, name := as.character(name)],
             name)[group[2:1]][, mean(value), by = list(no, rn)]
dcast(dL, rn ~ paste0('mean', no), value.var = 'V1')[, rn := NULL][]
# mean1 mean2
#1: 4.666667 7
#2: 5.666667 8
#3: 6.666667 9
There's probably a more elegant way of doing this, but:
library(reshape)
library(plyr)
x <- data.frame(a=c(1,2,3), b=c(4,5,6), c=c(6,7,8), d=c(7,8,9), e=c(10, 11, 12))
group <- data.frame(no=c(1,2,1,1,2), name=c("a", "b", "c", "d","e"))
a<-melt(x)
names(a)<-c("name", "score")
b<-merge(a, group, by="name")
c<-ddply(b, c("no"), summarize, meanscore=mean(score))
c
> c
no meanscore
1 1 5.666667
2 2 8.000000
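Note that c here holds one mean per course type pooled over all students. To reproduce mean.by.no (one mean per student and per type), a student id can be carried through the melt, for example (a sketch along the same lines; the student column is an added helper, not part of the original code):
x$student <- seq_len(nrow(x))  # added row id so each student survives the melt
a <- melt(x, id.vars = "student")
names(a) <- c("student", "name", "score")
b <- merge(a, group, by = "name")
ddply(b, c("student", "no"), summarize, meanscore = mean(score))
#  student no meanscore
#1       1  1  4.666667
#2       1  2  7.000000
#3       2  1  5.666667
#4       2  2  8.000000
#5       3  1  6.666667
#6       3  2  9.000000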

Rank each row in a data frame in descending order

I want to apply rank() to each row in a data frame by apply(data.frame,1,rank). However, rank is by default ascending. So when I apply rank() to my first row with the values (2,1,3,5), I get
[1] 2 1 3 4
However, I want
[1] 3 4 2 1
How can I do this using apply(data.frame,1,rank)?
Try
apply(-data, 1, rank, ties.method='first')
and compare with
apply(data, 1, rank, ties.method='first')
For your specific example
v1 <- c(2,1,3,5)
rank(v1)
#[1] 2 1 3 4
rank(-v1)
#[1] 3 4 2 1
data
set.seed(24)
data <- as.data.frame(matrix(sample(1:20, 4*20, replace=TRUE), ncol=4))
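Note that apply() over rows returns its result with one column per input row, so to keep the ranks in the original layout of data you can transpose the result:
ranked <- as.data.frame(t(apply(-data, 1, rank, ties.method = 'first')))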
