From long to wide formats just based on two columns Rstudio - r

This is my data frame:
I have a data frame of six columns; the last column contains the values. The column 'code' contains s and d, and the column 'Sex' contains M and f. There are two thousand offspring in the column 'offspring'.
seq parent code Sex offspring Value
1 49032 s M J44010_CCG7YANXX_2_661_X4 -0.38455056
2 48741 s M J44010_CCG7YANXX_2_661_X4 0.10574340
3 48757 s M J44010_CCG7YANXX_2_661_X4 0.39572906
4 48465 d f J44010_CCG7YANXX_2_661_X4 0.43409006
5 48521 d f J44010_CCG7YANXX_2_661_X4 0.40337447
6 48703 d f J44010_CCG7YANXX_2_661_X4 -0.38148980
The column parent includes ids for both males and females.
I want to keep the female/dam id, female/dam code and female/dam sex right beside the male/sire columns, and also keep the sire value and dam value separately. So the 'Value' column will be separated into two parts.
The data frame will look like the below:
'seq''parent1''sirecode''Sex''parent2''damcode''Sex''offspring''sireValue' 'damvalue'
1 49032 s M 48465 d f J44010 -0.38455056 0.43409006
2 48741 s M 48521 d f J44010 0.10574340 0.40337447
3 48757 s M 48703 d f J44010 0.39572906 -0.38148980
So, each offspring will have 3 or 4 pairs of parents.
I tried to use dcast function on it.

We could use dcast after creating a sequence column
library(data.table)
# Convert in place, then number the rows within each code/Sex group so that
# the i-th sire row and the i-th dam row share the same n.
setDT(df1)
df1[, n := seq_len(.N), by = .(code, Sex)]
# Spread the per-parent columns; rowid(n) numbers the rows sharing an n
# (1, 2, ...) and becomes the suffix of the new column names.
dcast(
  df1,
  n + offspring ~ rowid(n),
  value.var = c("parent", "code", "Sex", "Value"),
  sep = ""
)
# n offspring parent1 parent2 code1 code2 Sex1 Sex2 Value1 Value2
#1: 1 J44010_CCG7YANXX_2_661_X4 49032 48465 s d M f -0.3845506 0.4340901
#2: 2 J44010_CCG7YANXX_2_661_X4 48741 48521 s d M f 0.1057434 0.4033745
#3: 3 J44010_CCG7YANXX_2_661_X4 48757 48703 s d M f 0.3957291 -0.3814898
In base R, we can use reshape
# n: running index of each row within its Sex group.
df1$n <- ave(seq_along(df1$Sex), df1$Sex, FUN = seq_along)
# n1: running index among rows sharing the same n; used as the "time" suffix.
df1$n1 <- ave(df1$n, df1$n, FUN = seq_along)
# Drop the first column (seq) and widen on n1.
reshape(
  df1[-1],
  idvar = c("n", "offspring"),
  timevar = "n1",
  direction = "wide"
)
data
# Example input: six parent rows (three with code "s", three with code "d")
# that all belong to the same offspring.
df1 <- data.frame(
  seq = 1:6,
  parent = c(49032L, 48741L, 48757L, 48465L, 48521L, 48703L),
  code = rep(c("s", "d"), each = 3L),
  Sex = rep(c("M", "f"), each = 3L),
  offspring = rep("J44010_CCG7YANXX_2_661_X4", 6L),
  Value = c(
    -0.38455056, 0.1057434, 0.39572906,
    0.43409006, 0.40337447, -0.3814898
  ),
  stringsAsFactors = FALSE
)

Related

Pasting several values from a vector into a dataframe column

My dataframe "test" is like this:
a b c
d e f
I want to add strings to the 1st col so as to get this
a__3 b c
a__23 b c
a__45 b c
...
sb <- c(3, 23, 45)
datalist <- ""
for (i in 1:length(sb)) {
new <- apply(test[,1],1,paste0,collapse=("__" sb[i]))
datalist[i] <- new
}
I want to add rows to the test data frame, one for each sb[i].
I have tried rbind, but it does not give the correct result.
An idea is to replicate the rows based on the length of your sb vector, do the paste and filter to keep only the ones you are interested in, i.e.
# Repeat every row of d2 once per element of sb (indexing by row name).
n_copies <- length(sb)
d3 <- d2[rep(rownames(d2), n_copies), ]
# Append "__<sb value>" to the copies whose first column is exactly "a".
is_a <- d3$V1 == "a"
d3$V1[is_a] <- paste0(d3$V1[is_a], "__", sb)
# Keep only the rows whose first column contains "a".
d3[grepl("a", d3$V1), ]
# V1 V2 V3
#1 a__3 b c
#1.1 a__23 b c
#1.2 a__45 b c
DATA
dput(d2)
structure(list(V1 = c("a", "d"), V2 = c("b", "e"), V3 = c("c",
"f")), row.names = c(NA, -2L), class = "data.frame")

Count variable in one data.table for each category in a different data.frame R

If I have a data.table:
a <- data.table("NAME" = c("A", "B", "A"),
"PASS_FAIL" = c("F", "P", "P"))
And a data.frame (which is a frequency table of the number of times each unique NAME shows up in table a):
b <- aggregate(data.frame(Count = a$NAME), list(Name = a$NAME), length)
So b looks like this:
> b
Name Count
1 A 2
2 B 1
How do I now add a new column to table b that counts the number of F for each unique NAME? Expected output:
> b
Name Count FailCount
1 A 2 1
2 B 1 0
I know I need something to the effect of a$PASS_FAIL == "F".
With data.table, grouped by 'NAME', get the count of rows with .N and sum of the logical expression (PASS_FAIL == "F") for the number of fail cases
library(data.table)
# One row per NAME: the group size (.N) and the number of rows in the group
# whose PASS_FAIL equals "F" (summing a logical counts the TRUEs).
a[, list(Count = .N, FailCount = sum(PASS_FAIL == "F")), by = NAME]
# NAME Count FailCount
#1: A 2 1
#2: B 1 0
If we need a base R method
# Count the "F" rows per NAME, then attach that count to the frequency table b.
fail_counts <- aggregate(
  cbind(FailCount = PASS_FAIL == "F") ~ NAME,
  data = a,
  FUN = sum
)
merge(b, fail_counts, by.x = "Name", by.y = "NAME")

Turn ordered pairs into unordered pairs in a data frame with dplyr

I have a data frame that looks like this:
library(dplyr)
df <- data_frame(doc.x = c("a", "b", "c", "d"),
doc.y = c("b", "a", "d", "c"))
So that df is:
Source: local data frame [4 x 2]
doc.x doc.y
(chr) (chr)
1 a b
2 b a
3 c d
4 d c
This is a list of ordered pairs, a to d but also d to a, and so on. What is a dplyr-like way to return only a list of unordered pairs in this data frame? I.e.
doc.x doc.y
(chr) (chr)
1 a b
2 c d
Use pmin and pmax to sort the pairs alphabetically, i.e. turn (b,a) into (a,b) and then filter away all the duplicates.
# Build an order-independent key for each pair: dx is the alphabetically
# smaller document, dy the larger, so (b, a) and (a, b) share one key.
df %>%
  mutate(dx = pmin(doc.x, doc.y), dy = pmax(doc.x, doc.y)) %>%
  # .keep_all = TRUE retains doc.x/doc.y; without it current dplyr returns
  # only dx and dy, and the select() below would leave no columns at all.
  distinct(dx, dy, .keep_all = TRUE) %>%
  select(-dx, -dy)
doc.x doc.y
(chr) (chr)
1 a b
2 c d
Alternate way using data.table:
library(data.table)
# Example data: every pair appears in both orders.
df <- data.frame(
  doc.x = c("a", "b", "c", "d"),
  doc.y = c("b", "a", "d", "c"),
  stringsAsFactors = FALSE
)
setDT(df)
# pmax()/pmin() compare the two columns element by element, so no helper
# row-index column or per-row grouping is needed to order each pair.
df <- df[, .(Left = pmax(doc.x, doc.y), Right = pmin(doc.x, doc.y))]
# Each pair now has one canonical form; drop the repeats.
unique(df)
Left Right
1: b a
2: d c
Using dplyr
# Make character columns into factors.
# NOTE(review): this relies on as.data.frame() converting strings to factors,
# which was the default before R 4.0; on newer R levels() below would
# presumably return NULL -- confirm, or pass stringsAsFactors = TRUE.
df <- as.data.frame(unclass(df))
# Store the factor levels of each column; both columns are overwritten by the
# first transform() call in the pipeline below.
df$x.lvl <- levels(df$doc.x)
df$y.lvl <- levels(df$doc.y)
# Find unique pairs: build one "first,second" string per row, choosing the
# order of the two documents from the x.lvl < y.lvl comparison, then
# deduplicate the strings.
# NOTE(review): transform() is the base R function; it is unclear whether the
# group_by() has any effect on it -- confirm.
# NOTE(review): order() returns a sorting permutation, not a rank, so the
# x.lvl < y.lvl comparison may only work for inputs like this example -- verify.
res <- df %>%
group_by(doc.x) %>%
transform(x.lvl = order(doc.x),
y.lvl = order(doc.y)) %>%
transform(pair = ifelse(x.lvl < y.lvl,
paste(doc.x, doc.y, sep=","), paste(doc.y, doc.x, sep=","))) %>%
.$pair %>%
unique
Unique pairs
res
[1] a,b c,d
Levels: a,b c,d
Edit
Inspired by Backlin's solution, in base R
# Put the alphabetically smaller document first in every pair so that (b, a)
# and (a, b) both become "a,b", then keep one copy of each string.
# Expects doc.x and doc.y to be character columns.
unique(with(df, paste(pmin(doc.x, doc.y), pmax(doc.x, doc.y), sep = ",")))
[1] "a,b" "c,d"
Or to store in a data.frame
# Same idea, but keep each sorted pair as two columns instead of one string.
sorted_pairs <- data.frame(
  lvl1 = pmin(df$doc.x, df$doc.y),
  lvl2 = pmax(df$doc.x, df$doc.y)
)
unique(sorted_pairs)
lvl1 lvl2
1 a b
3 c d

Merge rows in same data frame based on common values

Most of the approaches I've come across involve using dplyr to apply a function when combining features, however, I would just like to restructure a single data frame without applying any function to each group.
I have a single data frame that looks like this:
gene_name chr nb_pos nb_ref nb_alt m_pos m_ref m_alt
ACAA1 3 38173733 C T 38144875 G T
ACAA1 3 38144875 G T 38144876 G A
I would like to combine each row with a common gene_name and chr, where each gene can have a variable amount of rows, to look like this:
gene_name chr np_pos1 nb_ref1 nb_alt1 nb_pos2 nb_ref2 nb_alt2 nb_alt2
ACAA1 3 38173733 C T 38144875 G T T
Does anyone know of a way to do this?
We can use dcast from the devel version of data.table i.e. v1.9.5. Instructions to install it are here.
Create a sequence column ('ind') based on the grouping columns ('gene_name', 'chr'), and then use dcast specifying the value.var columns.
library(data.table)
setDT(df1)
# Columns 3 to 8 are the ones spread into the wide layout.
value_cols <- names(df1)[3:8]
# ind numbers the rows within each gene_name/chr group.
df1[, ind := seq_len(.N), by = .(gene_name, chr)]
dcast(df1, gene_name + chr ~ ind, value.var = value_cols)
# gene_name chr 1_nb_pos 2_nb_pos 1_nb_ref 2_nb_ref 1_nb_alt 2_nb_alt 1_m_pos
#1: ACAA1 3 38173733 38144875 C G TRUE TRUE 38144875
# 2_m_pos 1_m_ref 2_m_ref 1_m_alt 2_m_alt
#1: 38144876 G G T A
Or using reshape from base R after we create the sequence column using ave.
# Running row number within each gene_name/chr group.
row_in_group <- with(
  df1,
  ave(seq_along(gene_name), gene_name, chr, FUN = seq_along)
)
df2 <- transform(df1, ind = row_in_group)
# One output row per gene_name/chr; the other columns get a ".<ind>" suffix.
reshape(
  df2,
  idvar = c("gene_name", "chr"),
  timevar = "ind",
  direction = "wide"
)
# gene_name chr nb_pos.1 nb_ref.1 nb_alt.1 m_pos.1 m_ref.1 m_alt.1 nb_pos.2
#1 ACAA1 3 38173733 C TRUE 38144875 G T 38144875
# nb_ref.2 nb_alt.2 m_pos.2 m_ref.2 m_alt.2
#1 G TRUE 38144876 G A
data
# Example input: two rows for the same gene_name/chr combination.
# nb_alt is logical here (presumably a "T" allele read in as TRUE -- confirm).
df1 <- data.frame(
  gene_name = c("ACAA1", "ACAA1"),
  chr = c(3L, 3L),
  nb_pos = c(38173733L, 38144875L),
  nb_ref = c("C", "G"),
  nb_alt = c(TRUE, TRUE),
  m_pos = 38144875:38144876,
  m_ref = c("G", "G"),
  m_alt = c("T", "A"),
  stringsAsFactors = FALSE
)

Opposite of dcast [duplicate]

This question already has answers here:
Repeat each row of data.frame the number of times specified in a column
(10 answers)
Closed 9 years ago.
The idea is to convert a frequency table to something geom_density can handle (ggplot2).
Starting with a frequency table
> dat <- data.frame(x = c("a", "a", "b", "b", "b"), y = c("c", "c", "d", "d", "d"))
> dat
x y
1 a c
2 a c
3 b d
4 b d
5 b d
Use dcast to make a frequency table
> library(reshape2)
> dat2 <- dcast(dat, x + y ~ ., fun.aggregate = length)
> dat2
x y count
1 a c 2
2 b d 3
How can this be reversed? melt does not seem to be the answer:
> colnames(dat2) <- c("x", "y", "count")
> melt(dat2, measure.vars = "count")
x y variable value
1 a c count 2
2 b d count 3
As you can use any aggregate function, you won't be able to reverse the dcast (aggregation) without knowing how to reverse the aggregation.
For length, the obvious inverse is rep. For aggregations like sum or mean there isn't an obvious inverse (that assumes you haven't saved the original data as an attribute)
Some options to invert length
You could use ddply
library(plyr)
# For each x group, repeat y as many times as the stored count.
ddply(dat2, .(x), summarize, y = rep(y, times = count))
or more simply
# Repeat every entry of x and y by its row's count, then reassemble the
# expanded columns into a data frame.
as.data.frame(
  lapply(dat2[c("x", "y")], function(col) rep(col, dat2$count))
)

Resources