How can I associate names from first column with repeating rows? - r

Say I have this data frame:
gene0 1 2 3
gene1 0 0 5
gene2 6 8 0
gene3 5 5 5
0 0 5
1 2 3
and I would like to associate the numbers in the "unnamed" rows with their genes, to get the following:
gene0 1 2 3
gene1 0 0 5
gene2 6 8 0
gene3 5 5 5
gene1 0 0 5
gene0 1 2 3
What is the best way to do it? Do I need to use linux or R for that?

One dplyr and tidyr option could be:
# Group rows by every column except the first (the gene name), so rows with
# identical values share a group; fill() then copies the name down into the
# unnamed rows of that group. across() replaces the superseded group_by_at(),
# and ungroup() hands back a plain tibble instead of a grouped one.
df %>%
  group_by(across(-1)) %>%
  fill(V1) %>%
  ungroup()
V1 V2 V3 V4
<chr> <int> <int> <int>
1 gene0 1 2 3
2 gene1 0 0 5
3 gene2 6 8 0
4 gene3 5 5 5
5 gene1 0 0 5
6 gene0 1 2 3
Or:
# Alternative: build an explicit group index from all columns except the
# first, fill V1 within each group, then drop the helper column again.
df %>%
group_by(group = group_indices(., !!!select(., -1))) %>%
fill(V1) %>%
ungroup() %>%
select(-group)
Sample data:
# Sample data: the last two rows carry NA in place of a gene name.
sample_text <- "gene0 1 2 3
gene1 0 0 5
gene2 6 8 0
gene3 5 5 5
NA 0 0 5
NA 1 2 3"
df <- read.table(
  text = sample_text,
  header = FALSE,
  na.strings = "NA",
  stringsAsFactors = FALSE
)

We can use match from base R
# Build one key per row from the value columns, then look each key up among
# the rows that already have a name. Matching against the named rows only
# (rather than indexing V1 by positions in unique(a1)) returns the right name
# even when the distinct rows are not the leading rows of the data.
a1 <- do.call(paste, df1[-1])
named <- !is.na(df1$V1)
df1$V1 <- df1$V1[named][match(a1, a1[named])]
df1$V1
#[1] "gene0" "gene1" "gene2" "gene3" "gene1" "gene0"
Update
Using the OP's dataset
# Read the OP's file and treat empty ids as missing.
df1 <- read.csv("newest.csv", stringsAsFactors = FALSE)
df1$id[df1$id == ""] <- NA
# Key each row by every column except the first, then index id by the key's
# position within the distinct keys.
# NOTE(review): this only lines up if the first occurrence of every distinct
# key sits in the leading rows, in order — confirm for this file.
a1 <- do.call(paste, df1[-1])
df1$id <- df1$id[match(a1, unique(a1))]
length(unique(df1$id))
#[1] 621
head(df1$id, 20)
#[1] "pop13_110" "pop1_2" "pop16_108" "pop2_10" "pop2_2" "pop2_3" "pop2_4" "pop2_5" "pop2_6" "pop2_7" "pop2_8"
#[12] "pop2_9" "pop2_10" "pop2_11" "pop7_81" "pop2_13" "pop2_15" "pop2_15" "pop2_16" "pop22_20"
tail(df1$id, 20)
# [1] "pop22_2" "pop22_3" "pop22_4" "pop22_5" "pop22_8" "pop22_9" "pop13_60" "pop16_131" "pop23_11" "pop22_25" "pop22"
#[12] "pop22_14" "pop22_15" "pop22_32" "pop22_28" "pop16_56" "pop22_18" "pop9_9" "pop22_21" "pop22_22"
data
# Sample data: six rows, the last two without a gene name.
df1 <- data.frame(
  V1 = c("gene0", "gene1", "gene2", "gene3", NA, NA),
  V2 = c(1L, 0L, 6L, 5L, 0L, 1L),
  V3 = c(2L, 0L, 8L, 5L, 0L, 2L),
  V4 = c(3L, 5L, 0L, 5L, 5L, 3L),
  stringsAsFactors = FALSE
)

The naive solution
library(tidyverse)

# Observations to label; the last row matches none of the known genes.
df <- tribble(
  ~col1, ~col2, ~col3,
  1, 2, 3,
  0, 0, 5,
  6, 8, 0,
  5, 5, 5,
  0, 0, 5,
  1, 2, 3,
  1, 1, 1
)

# Label every row by spelling out each gene's value pattern by hand; rows
# that fit no pattern fall through to the catch-all label (spelling of the
# label corrected from "unkown_gene").
df %>%
  mutate(gene = case_when(
    col1 == 1 & col2 == 2 & col3 == 3 ~ "gene0",
    col1 == 0 & col2 == 0 & col3 == 5 ~ "gene1",
    col1 == 6 & col2 == 8 & col3 == 0 ~ "gene2",
    col1 == 5 & col2 == 5 & col3 == 5 ~ "gene3",
    TRUE ~ "unknown_gene"
  ))
Another, much more extensible option is to create a tibble with the gene definitions (it can even be imported from Excel or a similar source)
# Gene definitions, one row per gene, written column by column.
df1 <- tibble(
  gene = c("gene0", "gene1", "gene2", "gene3"),
  col1 = c(1, 0, 6, 5),
  col2 = c(2, 0, 8, 5),
  col3 = c(3, 5, 0, 5)
)
and simply join the new observation on it
# Look each observation up in the gene definitions. Naming the key columns
# makes the join explicit instead of relying on whichever column names the
# two tables happen to share.
df %>%
  left_join(df1, by = c("col1", "col2", "col3"))

Here is another base R solution, in addition to the one by @akrun, where match() is applied to whole rows, each row being the values of columns V2 to V4
# Rows without a name are looked up among the rows that have one: each row's
# V2..V4 values become one element of a list (one column of the transposed
# data), and match() finds the named row holding the same values.
# The lookup table has to be the named rows; matching the unnamed rows
# against themselves just returns 1, 2, ... and hands out the wrong names.
unnamed <- is.na(df$V1)
df$V1[unnamed] <- df$V1[!unnamed][match(data.frame(t(df[unnamed, -1])),
                                        data.frame(t(df[!unnamed, -1])))]
# such that
df
#      V1 V2 V3 V4
# 1 gene0  1  2  3
# 2 gene1  0  0  5
# 3 gene2  6  8  0
# 4 gene3  5  5  5
# 5 gene1  0  0  5
# 6 gene0  1  2  3

Related

Creating duplicated data frames with different ID

I have a question for the community and hoping for some help.
I am trying to duplicate a data frame like the one below:
ID Time Solve
1 0 1
1 2 2
1 4 3
1 6 1
I am trying to duplicate the above data frame 100 times so, it would read as below:
ID Time Solve
1 0 1
1 2 2
1 4 3
1 6 1
2 0 1
2 2 2
2 4 3
2 6 1
3 0 1
3 2 2
3 4 3
3 6 1
4 0 1
4 2 2
4 4 3
4 6 1
.....
100 0 1
100 2 2
100 4 3
100 6 1
Does anyone have a good solution for this or a resource to read up on this?
Thanks!
We can use replicate
# Stack 100 copies of df1 on top of each other, then number the copies:
# every consecutive run of nrow(df1) rows receives the next ID.
out <- do.call(
  rbind,
  replicate(100, df1, simplify = FALSE)
)
out$ID <- rep(seq_len(100), each = nrow(df1))
Or another option is rep
# Repeat the whole row index 100 times (1..n, 1..n, ...) to stack the copies,
# then give each run of nrow(df1) rows its own ID.
out <- df1[rep(seq_len(nrow(df1)), times = 100), ]
out$ID <- rep(seq_len(100), each = nrow(df1))
Or make use of uncount
library(tidyr)
library(dplyr)
# uncount() repeats each row 100 times in place (all copies of row 1, then all
# copies of row 2, ...), so the copy number cycles 1..100 within every original
# row. Sorting by that number puts one complete copy of df1 under each ID.
# (The earlier version was missing a closing parenthesis and numbered the rows
# in runs of nrow(df1), which gave each ID several copies of the same row.)
uncount(df1, 100) %>%
  mutate(ID = rep(seq_len(100), times = nrow(df1))) %>%
  arrange(ID)
Or another option is
# Nest the rows of each ID into a single list-column row, repeat that row
# 100 times, renumber the copies, then expand the nested rows again.
# NOTE(review): nest_by() returns a row-wise tibble — confirm that
# row_number() counts across all 100 rows here rather than restarting per row.
df1 %>%
nest_by(ID) %>%
uncount(100) %>%
mutate(ID = row_number()) %>%
unnest(c(data))
data
# Sample data: a single ID observed at four time points.
df1 <- data.frame(
  ID = c(1L, 1L, 1L, 1L),
  Time = c(0L, 2L, 4L, 6L),
  Solve = c(1L, 2L, 3L, 1L)
)

Get difference with previous value and with next value

I have a dataframe like this. A small sample actually the df is bigger:
LOW 1 4 NA
MID 3 4 4
HIG 2 5 4
And would like to get the difference for LOW and HIG with MID so the ending df would be like this:
LOW 2 0 NA
MID 3 4 4
HIG 1 1 0
So you're getting: LOW = 3 - 1 = 2 and HIG = 3 - 2 = 1. I can do it via VBA macros, but I want to scale it with R.
It can be done with mutate_if/mutate_at
library(dplyr)
# For every numeric column: rows other than 'MID' are replaced by the absolute
# difference between their value and the 'MID' row's value in that column;
# the 'MID' row itself is kept unchanged.
df1 %>%
mutate_if(is.numeric, ~ case_when(grp != 'MID' ~
abs(. - .[grp == 'MID']), TRUE ~ .))
# grp v1 v2 v3
#1 LOW 2 0 NA
#2 MID 3 4 4
#3 HIG 1 1 0
Or in base R
# Flag the reference ('MID') row, then overwrite the value columns of every
# other row with the absolute difference from the reference values.
# rep(..., each = sum(!i1)) repeats each reference value once per updated row
# so it lines up column by column with the rows being changed.
i1 <- df1$grp == 'MID'
df1[!i1, -1] <- abs(df1[!i1, -1] - rep(unlist(df1[i1, -1]), each = sum(!i1)))
data
# Sample data: one row per group, with a missing v3 value for LOW.
df1 <- data.frame(
  grp = c("LOW", "MID", "HIG"),
  v1 = c(1L, 3L, 2L),
  v2 = c(4L, 4L, 5L),
  v3 = c(NA, 4L, 4L),
  stringsAsFactors = FALSE
)
You can change the 'LOW', 'HIG' rows after subtracting by 'MID' :
# Replace the value columns of the 'LOW' row, then of the 'HIG' row, with the
# absolute difference from the 'MID' row (which itself is never modified).
for (g in c('LOW', 'HIG')) {
  df1[df1$grp == g, -1] <- abs(df1[df1$grp == 'MID', -1] - df1[df1$grp == g, -1])
}
df1
# grp v1 v2 v3
#1 LOW 2 0 NA
#2 MID 3 4 4
#3 HIG 1 1 0

Repeat a value within each ID

I have a dataset in R in long format. Each ID does not appear the same number of times (i.e. one ID might be one row, another might appear 79 rows).
e.g.
ID V1 V2
1 B 0
1 A 1
1 C 0
2 C 0
3 A 0
3 C 0
I want to create a variable V3 such that, if any of the rows for a given ID has V2 == 1, then 1 is repeated for every row of that ID
e.g.
ID V1 V2 V3
1 B 0 1
1 A 1 1
1 C 0 1
2 C 0 0
3 A 0 0
3 C 0 0
In base R we can use any - and ave for the grouping.
# For each ID, test whether any of its V2 values equals 1 and write that
# answer onto every row of the ID.
DF$V3 <- ave(DF$V2, DF$ID, FUN = function(v) any(v == 1))
DF
# ID V1 V2 V3
#1 1 B 0 1
#2 1 A 1 1
#3 1 C 0 1
#4 2 C 0 0
#5 3 A 0 0
#6 3 C 0 0
data
# Sample data in long format: IDs occur a varying number of times.
DF <- data.frame(
  ID = c(1L, 1L, 1L, 2L, 3L, 3L),
  V1 = c("B", "A", "C", "C", "A", "C"),
  V2 = c(0L, 1L, 0L, 0L, 0L, 0L),
  stringsAsFactors = FALSE
)
Here's a tidyverse solution.
If V2 can only be 0 or 1:
library(dplyr)
# With V2 restricted to 0/1, the per-ID maximum is 1 exactly when some row of
# that ID has V2 == 1. ungroup() drops the grouping afterwards so later steps
# do not silently keep operating per ID.
df %>%
  group_by(ID) %>%
  mutate(V3 = max(V2)) %>%
  ungroup()
If you want to check that V2 is exactly 1.
# Explicit test for V2 == 1 within each ID, stored as 0/1. ungroup() drops
# the grouping afterwards so later steps do not silently keep operating per ID.
df %>%
  group_by(ID) %>%
  mutate(V3 = as.numeric(any(V2 == 1))) %>%
  ungroup()
Another base R option is
# Collect the IDs that have at least one row with V2 == 1 and flag every row
# whose ID is among them. Comparing against the ID values themselves (rather
# than positions in a rowsum() result) also works when the IDs are not exactly
# 1, 2, ..., k; which() keeps missing V2 values from selecting an NA ID.
df$V3 <- with(df, +(ID %in% ID[which(V2 == 1)]))

extract the row having column values length equal to 1 in R

my input data is
df
anger sad joy happy trust disgust
1 1 0 1 2 3 0
2 2 0 0 2 0 3
3 2 2 1 1 1 1
4 0 1 1 1 0 1
I want output like this
mydata
anger sad joy happy trust disgust col
1 1 0 1 2 3 0 trust
2 2 0 0 2 0 3 disgust
I want to extract max value colname from each row but output only those rows having only one max value colname and discard all other row with more than one colname.
i tried this
d1 <- df[!apply(df[-1], 1, function(x) anyDuplicated(x[x == max(x)])),]
but i am getting this
anger sad joy happy trust disgust
1 1 0 1 2 3 0
2 2 0 0 2 0 3
3 2 2 1 1 1 1
I don't want third row in the output.
Thanks for help in advance.
We can use max.col to get the index of columns for each row after subsetting the rows
# Keep only the rows whose values (first column excluded) contain no
# duplicates, then record the name of the column holding each kept row's
# maximum.
d1 <- mydata[apply(mydata[-1], 1, anyDuplicated) == 0, ]
d1$out <- names(d1[-1])[max.col(d1[-1], ties.method = 'first')]
d1
# zone_id v1 v2 v3 v4 out
#1 1 12 15 18 20 v4
#3 3 31 28 14 2 v1
#4 4 12 16 9 5 v2
#5 5 5 18 10 12 v2
Update
If the OP wanted to remove only the duplicate values of max values, then replace the first line with
# Drop a row only when its maximum value occurs more than once; duplicates
# among the smaller values are allowed.
d1 <- mydata[!apply(mydata[-1], 1, function(x) anyDuplicated(x[x == max(x)])),]
Update2
Based on the new dataset by the OP, we don't need to remove the first column, as it is not an id column
# Same filter as before, but applied to all columns: keep the rows whose
# maximum occurs exactly once, then name the column holding that maximum.
d2 <- mydata1[!apply(mydata1, 1, function(x) anyDuplicated(x[x == max(x)])),]
d2$out <- names(d2)[max.col(d2, 'first')]
d2
# anger sad joy happy trust disgust out
#1 1 0 1 2 3 0 trust
#2 2 0 0 2 0 3 disgust
data
# Sample data: four rows of integer scores, one column per emotion.
mydata1 <- structure(list(anger = c(1L, 2L, 2L, 0L), sad = c(0L, 0L, 2L,
1L), joy = c(1L, 0L, 1L, 1L), happy = c(2L, 2L, 1L, 1L), trust = c(3L,
0L, 1L, 0L), disgust = c(0L, 3L, 1L, 1L)), .Names = c("anger", "sad",
"joy", "happy", "trust", "disgust"), row.names = c(NA, 4L),
class = "data.frame")
you can try:
# Pipeline outline:
#   1. compute the row-wise maximum of the value columns (zone_id excluded)
#      and keep it as mx, bound back onto the original data;
#   2. flg counts how many columns equal mx — the mx column itself included —
#      so flg == 2 keeps rows where exactly one other column holds the maximum;
#   3. reshape to long form, keep the column whose value equals mx, and join
#      the original columns back on.
# NOTE(review): the flg count also compares zone_id with mx — confirm that
# an id can never coincide with a row's maximum.
mydata %>%
select(-zone_id) %>%
mutate(mx = do.call(pmax, (.))) %>%
select(mx) %>%
cbind(mydata) %>%
mutate( flg = rowSums(. == mx)) %>%
filter(flg ==2) %>%
select(-flg) %>%
gather(key = out, value= v, -mx, -zone_id) %>%
filter(mx == v) %>%
select(zone_id, mx, out) %>%
left_join(mydata)
which gives:
zone_id mx out v1 v2 v3 v4
1 3 31 v1 31 28 2 2
2 4 16 v2 1 16 9 1
3 5 18 v2 5 18 10 12
4 1 20 v4 12 15 18 20

How to combine two rows in R?

I would like to combine/sum two rows based on rownames to make one row in R. The best route might be to create a new row and sum the two rows together.
Example df:
A 1 3 4 6
B 3 2 7 9
C 6 8 1 2
D 3 2 8 9
Where A,B,C,D are rownames, I want to combine/sum two rows (A & C) into one to get:
A+C 7 11 5 8
B 3 2 7 9
D 3 2 8 9
Thank you.
aggregate to the rescue:
# Give rows A and C a shared group label, leave the other row names as they
# are, and sum every column within each label.
aggregate(
  df,
  by = list(Group = ifelse(rownames(df) %in% c("A", "C"), "A&C", rownames(df))),
  FUN = sum
)
# Group V2 V3 V4 V5
#1 A&C 7 11 5 8
#2 B 3 2 7 9
#3 D 3 2 8 9
You can replace the A row using the standard addition arithmetic operator, and then remove the C row with a logical statement.
# Add row C onto row A in place, then show every row except C.
df["A", ] <- df["C", ] + df["A", ]
df[setdiff(rownames(df), "C"), ]
# V2 V3 V4 V5
# A 7 11 5 8
# B 3 2 7 9
# D 3 2 8 9
For more than two rows, you can use colSums() for the addition. This presumes the first value in nm is the one we are replacing/keeping.
# Rows to merge; the first name is the row that receives the column sums,
# the remaining names are dropped from the result.
nm <- c("A", "C")
df[nm[1], ] <- colSums(df[nm, ])
df[setdiff(rownames(df), nm[-1]), ]
I'll leave it up to you to change the row names. :)
Data:
# Sample data: four integer columns with row names A to D.
df <- data.frame(
  V2 = c(1L, 3L, 6L, 3L),
  V3 = c(3L, 2L, 8L, 2L),
  V4 = c(4L, 7L, 1L, 8L),
  V5 = c(6L, 9L, 2L, 9L),
  row.names = c("A", "B", "C", "D")
)
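Matrix multiplication also works. Here X is the data as a matrix, e.g. X <- as.matrix(df), and each row of A marks which rows of X are added together: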
> A <- matrix(c(1,0,0,0,1,0,1,0,0,0,0,1), 3)
> A
[,1] [,2] [,3] [,4]
[1,] 1 0 1 0
[2,] 0 1 0 0
[3,] 0 0 0 1
> A %*% X
V2 V3 V4 V5
[1,] 7 11 5 8
[2,] 3 2 7 9
[3,] 3 2 8 9
Or using the Matrix package for sparse matrices:
fac2sparse(factor(c(1,2,1,4))) %*% X

Resources