How to get all rows sharing same url into 1 row? - r

After unnesting, my data frame has multiple rows per link, each mostly filled with NA values, that could be collapsed into a single row. All columns are text/character data.
Example:
link feature-1 feature-2 feature-3
link_1 a. NA NA
link_1. NA NA b
link_1. NA. c NA
link2 NA. a NA
link_2 NA NA d
link_2 x NA NA

Assuming that you are only ever combining NA values and text, then I recommend the following:
library(dplyr)

# Mock dataset: every group spreads its text over several rows, with NA
# in the remaining cells.
df <- data.frame(
  grp = c("a", "a", "a", "b", "b", "b"),
  value1 = c(NA, NA, "text", "text", NA, NA),
  value2 = c(NA, "txt", NA, NA, "txt", NA),
  stringsAsFactors = FALSE
)

# Replace the NAs with empty strings first, so that pasting does not
# produce the literal text "NA".
blanked <- mutate(
  df,
  value1 = coalesce(value1, ""),
  value2 = coalesce(value2, "")
)

# Concatenate all strings of a group, giving a single row per group.
blanked %>%
  group_by(grp) %>%
  summarise(
    val1 = paste0(value1, collapse = ""),
    val2 = paste0(value2, collapse = "")
  )
Based on this answer.
Looking at the data in your question, you might need to first standardize some values. Because "link_1" vs "link_1." and "NA" vs "NA." will be treated as different.

You can use across to get first non-NA value by group in multiple columns.
library(dplyr)

# For every link, keep the first non-missing entry of each feature column.
# Indexing an empty vector with [1] yields NA, so a column that is entirely
# NA within a link stays NA.
df %>%
  group_by(link) %>%
  summarise(
    across(starts_with("feature"), function(x) na.omit(x)[1])
  )
# link feature.1 feature.2 feature.3
# <chr> <chr> <chr> <chr>
#1 link_1 a c b
#2 link_2 x a d
data
# Sample data: two links, each with its three feature values spread over
# three rows.
df <- data.frame(
  link = c("link_1", "link_1", "link_1", "link_2", "link_2", "link_2"),
  feature.1 = c("a", NA, NA, NA, NA, "x"),
  feature.2 = c(NA, NA, "c", "a", NA, NA),
  feature.3 = c(NA, "b", NA, NA, "d", NA),
  stringsAsFactors = FALSE
)

Related

How to find common and unique elements across multiple data-frames

Objective: to see the common elements (the rows, which are basically gene names) across my different comparisons.
This was the answer which I tried to follow.
df1 = data.frame(genes = c('gene1', 'gene3', 'gene4', 'gene2'))
df2 = data.frame(genes = c('gene3', 'gene2', 'gene5', 'gene1', "genet"))
df3 = data.frame(genes = c('gene6', 'gene3', 'gene4', 'gdene7', 'genex', "gene10"))
dfList <- list(df1, df2, df3)
reduce(dfList, inner_join)
reduce(dfList, inner_join)
Joining, by = "genes"
Joining, by = "genes"
genes
1 gene3
This fails in this case
df1 = data.frame(genes = c('gene1', 'gene3', 'gene4', 'gene2'))
df2 = data.frame(genes = c('gene3', 'gene2', 'gene5', 'gene1', "genet"))
df3 = data.frame(genes = c('gene6', 'gene13', 'gene4', 'gdene7', 'genex', "gene10"))
dfList <- list(df1, df2, df3)
reduce(dfList, inner_join)
reduce(dfList, inner_join)
Joining, by = "genes"
Joining, by = "genes"
[1] genes
<0 rows> (or 0-length row.names)
Now how to address this problem. I gave a small set I have like 15 comparison.
Expected output
gene3 df1 df2 df3 ## for common genes
gene1 df1 df2 ## for genes which arr not across all the combination
gene2
In the first case the solution works because gene3 is present in all the data frames, but it fails when a gene is present in only two of them.
So how do I find out all the possible combination where the genes are present in different possible combination.
For example if gene3 is present in all three so it is reported but gene1 and gene2 are present in df1 and df2 but these are not reported.
So rather than only the genes that are present in every condition (which is most likely none), I would like to see, for each gene, every combination of conditions in which it is present.
My actual dataframes are named as such which is in a list
names(result_abd)
[1] "M0_vs_M1_TCGA_stages" "M0_vs_M2_TCGA_stages" "M0_vs_M3_TCGA_stages" "M0_vs_M4_TCGA_stages" "M0_vs_M5_TCGA_stages" "M1_vs_M2_TCGA_stages"
[7] "M1_vs_M3_TCGA_stages" "M1_vs_M4_TCGA_stages" "M1_vs_M5_TCGA_stages" "M2_vs_M3_TCGA_stages" "M2_vs_M4_TCGA_stages" "M2_vs_M5_TCGA_stages"
[13] "M3_vs_M4_TCGA_stages" "M3_vs_M5_TCGA_stages" "M4_vs_M5_TCGA_stages"
>
So I would have like 15 columns for each dataframe
I ran your code the output is as such
dput(head(a))
structure(list(gene = c("ENSG00000000003", "ENSG00000000971",
"ENSG00000002726", "ENSG00000003989", "ENSG00000005381", "ENSG00000006534"
), dfM0_vs_M1_TCGA_stages = c("M0_vs_M1_TCGA_stages", "M0_vs_M1_TCGA_stages",
"M0_vs_M1_TCGA_stages", "M0_vs_M1_TCGA_stages", "M0_vs_M1_TCGA_stages",
"M0_vs_M1_TCGA_stages"), dfM0_vs_M2_TCGA_stages = c(NA, "M0_vs_M2_TCGA_stages",
"M0_vs_M2_TCGA_stages", NA, "M0_vs_M2_TCGA_stages", NA), dfM0_vs_M3_TCGA_stages = c("M0_vs_M3_TCGA_stages",
"M0_vs_M3_TCGA_stages", "M0_vs_M3_TCGA_stages", NA, "M0_vs_M3_TCGA_stages",
NA), dfM0_vs_M4_TCGA_stages = c("M0_vs_M4_TCGA_stages", NA, "M0_vs_M4_TCGA_stages",
NA, "M0_vs_M4_TCGA_stages", "M0_vs_M4_TCGA_stages"), dfM0_vs_M5_TCGA_stages = c("M0_vs_M5_TCGA_stages",
NA, "M0_vs_M5_TCGA_stages", NA, "M0_vs_M5_TCGA_stages", "M0_vs_M5_TCGA_stages"
), dfM1_vs_M2_TCGA_stages = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), dfM1_vs_M3_TCGA_stages = c(NA,
NA, NA, NA, "M1_vs_M3_TCGA_stages", NA), dfM1_vs_M4_TCGA_stages = c(NA,
"M1_vs_M4_TCGA_stages", NA, NA, NA, NA), dfM1_vs_M5_TCGA_stages = c(NA,
NA, "M1_vs_M5_TCGA_stages", NA, NA, NA), dfM2_vs_M3_TCGA_stages = c(NA,
NA, NA, NA, "M2_vs_M3_TCGA_stages", NA), dfM2_vs_M4_TCGA_stages = c(NA,
"M2_vs_M4_TCGA_stages", NA, NA, NA, NA), dfM2_vs_M5_TCGA_stages = c(NA,
NA, "M2_vs_M5_TCGA_stages", NA, "M2_vs_M5_TCGA_stages", NA),
dfM3_vs_M4_TCGA_stages = c(NA, "M3_vs_M4_TCGA_stages", NA,
NA, "M3_vs_M4_TCGA_stages", NA), dfM3_vs_M5_TCGA_stages = c(NA,
"M3_vs_M5_TCGA_stages", NA, NA, "M3_vs_M5_TCGA_stages", NA
), dfM4_vs_M5_TCGA_stages = c(NA, NA, "M4_vs_M5_TCGA_stages",
NA, NA, NA)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
Dataframe format
A tibble: 6 × 16
gene dfM0_vs_M1_TCGA… dfM0_vs_M2_TCGA… dfM0_vs_M3_TCGA… dfM0_vs_M4_TCGA… dfM0_vs_M5_TCGA… dfM1_vs_M2_TCGA… dfM1_vs_M3_TCGA… dfM1_vs_M4_TCGA…
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 ENSG00000000003 M0_vs_M1_TCGA_s… NA M0_vs_M3_TCGA_s… M0_vs_M4_TCGA_s… M0_vs_M5_TCGA_s… NA NA NA
2 ENSG00000000971 M0_vs_M1_TCGA_s… M0_vs_M2_TCGA_s… M0_vs_M3_TCGA_s… NA NA NA NA M1_vs_M4_TCGA_s…
3 ENSG00000002726 M0_vs_M1_TCGA_s… M0_vs_M2_TCGA_s… M0_vs_M3_TCGA_s… M0_vs_M4_TCGA_s… M0_vs_M5_TCGA_s… NA NA NA
4 ENSG00000003989 M0_vs_M1_TCGA_s… NA NA NA NA NA NA NA
5 ENSG00000005381 M0_vs_M1_TCGA_s… M0_vs_M2_TCGA_s… M0_vs_M3_TCGA_s… M0_vs_M4_TCGA_s… M0_vs_M5_TCGA_s… NA M1_vs_M3_TCGA_s… NA
6 ENSG00000006534 M0_vs_M1_TCGA_s… NA NA M0_vs_M4_TCGA_s… M0_vs_M5_TCGA_s… NA NA NA
Now this is what i wanted. The next step as an example I would like to see
If I take this gene ENSG00000000971 is present in 7 comparison but not in others where its is reported as NA.How do I group them.
Like making another data frame with those genes lets say are present in multiple comparison and not include wherever this is NA
It's not clear to me exactly how you want your output formatted (several index columns, one column containing a string, or a list column). But here's one option. I start by combining your list of data frames into a single data frame, with an index indicating the source.
library(tidyverse)

# Stack the data frames, tagging every row with the position of the data
# frame it came from (the "df" column).
dfList <- list(df1, df2, df3)
stacked <- bind_rows(dfList, .id = "df")

# Spread that tag into one column per source (df1, df2, ...); a gene that
# is absent from a source gets NA in that column.
pivot_wider(
  stacked,
  names_from = df,
  names_prefix = "df",
  values_from = df
)
# A tibble: 10 × 4
genes df1 df2 df3
<chr> <chr> <chr> <chr>
1 gene1 1 2 NA
2 gene3 1 2 3
3 gene4 1 NA 3
4 gene2 1 2 NA
5 gene5 NA 2 NA
6 genet NA 2 NA
7 gene6 NA NA 3
8 gdene7 NA NA 3
9 genex NA NA 3
10 gene10 NA NA 3
Addition in response to OP's question below. (Though note that's actually a new question and really should be a new post.)
# Genes that occur in the third data frame and in no other one.
# bind_rows(.id = ) stores the source index as text, hence the "3".
dfList %>%
  bind_rows(.id = "df") %>%
  group_by(genes) %>%
  summarise(only_in_third = all(df == "3")) %>%
  filter(only_in_third) %>%
  pull(genes)
[1] "gdene7" "gene10" "gene6" "genex"
Once again, the key is to put all the data into a single data frame. (And the desired format of the output is not clear.)

Wide to long with pivot_longer and mix of numeric and character data

# Example data: four numeric question columns (mostly NA) plus a group label.
help <- data.frame(
  id    = c(100, 100, 101, 102, 102),
  q1    = c(NA, 1, NA, NA, 3),
  q2    = c(1, NA, 2, NA, NA),
  q3    = c(NA, 1, NA, 4, NA),
  q4    = c(NA, NA, 4, NA, 5),
  group = c("a", "b", "c", "a", "c")
)
# Make sure the label is plain character rather than a factor.
help[["group"]] <- as.character(help[["group"]])
I am trying to pivot longer so dataset looks like this:
id score group
100 NA a
100 1 b
100 NA c
...
But I get an error with the numeric values of q1-q4 and the character string group.
pivot_longer(help, !id, names_to = "score",
values_to = "group", values_ptypes = list(group = 'character'))
Error: Can't convert <double> to <character>.
How can I pivot longer but also preserve the group variable (where there is several missing data for the q1-4 there is a match for every id and group)?
library(tidyr)

# Pivot only the question columns; id and group are excluded from the pivot
# and are therefore repeated on every resulting row.
long_scores <- pivot_longer(
  help,
  cols = -c(id, group),
  names_to = "question",
  values_to = "score"
)

# Drop the question label and order the rows by id, then group.
output <- dplyr::arrange(
  dplyr::select(long_scores, -question),
  id, group
)
Output
head(output)
# A tibble: 6 × 3
id group score
<dbl> <chr> <dbl>
1 100 a NA
2 100 a 1
3 100 a NA
4 100 a NA
5 100 b 1
6 100 b NA

sum the column values(group_by) keeping NA values and not replacing with zero in R

I am trying to sum the column values group by another column, I need to keep NA values i should not replace the values with zero because based on the sum I have to give Rank if sum is NA rank should be empty.
Below is an example to help understand the problem:
column1 column2 column3
a gb 10
b gb NA
c gb NA
d gb 4
e Hs 81
b Hs NA
c Hs 2
a Rd NA
x Rd NA
z Rd NA
I have to sum column3 values group by column2 and while doing sum I should not take NA values into consideration and sum other values for that group I should not remove or replace the NA values with zero. based on this sum I have to give rank if the sum is NA(refer group Rd) there will be no Rank for that. I can replace the NA values with zero but I have to give rank after sum, if sum is NA rank will be empty(in case of group Rd from the above data). and for group gb the sum value is 14 and group Hs sum value is 83 and group Rd sum value is NA in this case there will be no rank for the group.
below is the code snippet i tried
df %>% group_by_at(column2) %>%
summarise(sum = sum(column3, na.rm = TRUE))
The code above sums the values as if every NA were zero, but I don't want to replace them: I need the NA when assigning ranks. Can you please suggest a solution for this?
expected output:
column2 column3 rank
gb 14 2
Hs 83 1
Rd NA No Rank
You could use rank with na.last = "keep" to give rank as NA
library(dplyr)

# Total column3 within each column2 group, ignoring NAs, except that a
# group containing nothing but NAs keeps NA instead of collapsing to 0.
group_totals <- df %>%
  group_by(column2) %>%
  summarise(
    column3 = if (!all(is.na(column3))) sum(column3, na.rm = TRUE) else NA
  ) %>%
  ungroup()

# Rank the totals in descending order; na.last = "keep" leaves the rank of
# an NA total as NA.
mutate(group_totals, rank = rank(-column3, na.last = "keep"))
# column2 column3 rank
# <fct> <int> <dbl>
#1 gb 14 2
#2 Hs 83 1
#3 Rd NA NA
We can use sum_ from hablar which would return NA if all the values are NA and then use dense_rank
library(dplyr)
library(hablar)

# sum_() gives NA for a group whose values are all NA.
group_totals <- summarise(
  group_by(df, column2),
  column3 = sum_(column3)
)

# Negating the totals makes the largest one rank first.
mutate(group_totals, rank = dense_rank(-column3))
# A tibble: 3 x 3
# column2 column3 rank
# <chr> <int> <int>
#1 gb 14 2
#2 Hs 83 1
#3 Rd NA NA
Or using data.table
library(data.table)

# sum_ is qualified with its package so this snippet also runs on its own,
# without hablar having been attached by an earlier snippet.
setDT(df)  # turns df into a data.table in place
totals <- df[, .(column3 = hablar::sum_(column3)), by = column2]

# Descending rank of the totals; an NA total keeps an NA rank.
totals[, rank := frank(-column3, na.last = "keep")][]
data
# Sample data: integer scores (with NAs) in three column2 groups; every
# value of group "Rd" is missing.
df <- data.frame(
  column1 = c("a", "b", "c", "d", "e", "b", "c", "a", "x", "z"),
  column2 = c("gb", "gb", "gb", "gb", "Hs", "Hs", "Hs", "Rd", "Rd", "Rd"),
  column3 = c(10L, NA, NA, 4L, 81L, NA, 2L, NA, NA, NA),
  stringsAsFactors = FALSE
)
Base R solution:
# Sum column3 per column2 group. na.action = na.pass hands the NAs through
# to FUN, which returns NA for a group that is entirely NA and the sum of
# the non-missing values otherwise.
# rank() uses na.last = "keep" (spelled out here), so an NA total already
# gets an NA rank and needs no extra ifelse() around it.
within(
  aggregate(
    column3 ~ column2,
    data = df,
    FUN = function(x) if (all(is.na(x))) NA_integer_ else sum(x, na.rm = TRUE),
    na.action = na.pass
  ),
  rank <- rank(-column3, na.last = "keep")
)

How do I remove na in R and make below value to go up

I have a data frame like below:
how do I remove na and use below value to go up?
Thanks
id name.america name.europe name.asia
1 a <NA> <NA>
2 <NA> b <NA>
3 <NA> <NA> c
4 d <NA> <NA>
Change to:
id name.america name.europe name.asia
1 a b c
2 d
We can loop through the columns and remove the NA, then make the lengths of the list elements same by appending NA at the end after getting the max length of the list element. Based on that, subset the 'id' column of the dataset and append with the output
# Strip the NAs from every column except the first (id) one.
non_missing <- lapply(df1[-1], na.omit)

# Pad each column with trailing NAs up to the length of the longest one so
# the columns can be bound into a data frame again.
target_len <- max(lengths(non_missing))
padded <- lapply(non_missing, function(col) {
  length(col) <- target_len
  col
})
compacted <- data.frame(padded)

# Re-attach as many leading ids as there are rows left.
out1 <- cbind(id = df1$id[seq_len(nrow(compacted))], compacted)
out1
# id name.america name.europe name.asia
#1 1 a b c
#2 2 d <NA> <NA>
If we need NA to be changed to blanks ("") - not recommended
out1[is.na(out1)] <- ""
data
# Sample data: an id plus three name columns, each holding one or two names
# and NA elsewhere.
df1 <- data.frame(
  id = 1:4,
  name.america = c("a", NA, NA, "d"),
  name.europe = c(NA, "b", NA, NA),
  name.asia = c(NA, NA, "c", NA),
  stringsAsFactors = FALSE
)
tidyverse-based solution
# library() stops with an error when the package is missing, whereas
# require() only returns FALSE and lets the script fail later.
library(tidyverse)

df1 %>%
  # One row per non-missing name; missing cells are dropped while pivoting.
  pivot_longer(
    cols = -id,
    names_to = "name",
    values_to = "val",
    values_drop_na = TRUE
  ) %>%
  # The original id is discarded: rows are re-numbered within each name
  # column so that the remaining values line up from the top.
  select(-id) %>%
  group_by(name) %>%
  mutate(id = row_number()) %>%
  ungroup() %>%
  # names_sort = TRUE keeps the alphabetical column order that spread() gave.
  pivot_wider(names_from = name, values_from = val, names_sort = TRUE)
Results
# A tibble: 2 x 4
id name.america name.asia name.europe
<int> <chr> <chr> <chr>
1 1 a c b
2 2 d NA NA
Notes
If desired you can re-order columns with select or that variable prior to transformation.
NAs are left as such. If desired, you can use tidyr::replace_na to insert some string or space. I would discourage you from doing that.
Data
Taken from @akrun's answer above.
# Sample data: an id plus three name columns, each holding one or two names
# and NA elsewhere.
df1 <- structure(
  list(
    id = 1:4,
    name.america = c("a", NA, NA, "d"),
    name.europe = c(NA, "b", NA, NA),
    name.asia = c(NA, NA, "c", NA)
  ),
  class = "data.frame",
  row.names = c(NA, -4L)
)

# Move the non-missing names to the top of every name column and pad the
# rest with "" so each column keeps its original length. df1[-1] (list-style
# indexing) stays a data frame even if there is only one name column.
df1[-1] <- lapply(df1[-1], function(x) {
  kept <- x[!is.na(x)]
  c(kept, rep("", length(x) - length(kept)))
})

# Keep only as many rows as the fullest column needs. seq_len() selects no
# rows when every name is missing, where 1:0 would have selected row 1.
n_keep <- max(colSums(df1[-1] != ""))
df1[seq_len(n_keep), ]
# id name.america name.europe name.asia
#1 1 a b c
#2 2 d

How to fill in NAs of various columns grouped by duplicated IDs in R

I have a table with columns id, colA, and colB. The data contains duplicated id columns where for some rows, colA or colB is null, but its duplicated id has valid values. I want to clean the data so that I remove duplicates, but have complete data. For example my data looks like
id | colA | colB
1 NA X
1 Y X
2 Z NA
2 Z Y
3 Z Y
3 Z Y
4 NA NA
4 NA NA
and I want my dataframe to look like
id | colA | colB
1 Y X
2 Z Y
3 Z Y
4 NA NA
I usually use the ifelse statement to replace missing values, but I am confused on how to use this in the context of having duplicated ids.
First add a column that tells how many NAs in each row. Then using dplyr, remove duplicated rows first and then for each id keep the row with least missing values -
# Helper column: number of missing cells in each row.
df[["test"]] <- rowSums(is.na(df))

df %>%
  distinct() %>%           # drop rows that are exact duplicates
  arrange(id, test) %>%    # within an id, fewest missing cells first
  group_by(id) %>%
  slice(1) %>%             # keep that most complete row per id
  ungroup() %>%
  select(-test)
# A tibble: 4 x 3
id colA colB
<int> <chr> <chr>
1 1 y x
2 2 z y
3 3 z y
4 4 <NA> <NA>
EDIT:
Actually no need to remove duplicates first. Just keeping the row with least missing values for each id should also work -
# Helper column: number of missing cells in each row.
df[["test"]] <- rowSums(is.na(df))

# Sort so that the most complete row of every id comes first ...
by_completeness <- arrange(df, id, test)

# ... then keep just that row per id and drop the helper column.
by_completeness %>%
  group_by(id) %>%
  slice(1) %>%
  ungroup() %>%
  select(-test)
Data -
# Sample data: every id appears twice; some rows miss colA and/or colB.
# FALSE is spelled out because F is an ordinary variable that can be
# reassigned.
df <- data.frame(
  id = rep(1:4, each = 2),
  colA = c(NA, "y", "z", "z", "z", "z", NA, NA),
  colB = c("x", "x", NA, "y", "y", "y", NA, NA),
  stringsAsFactors = FALSE
)
This answer is very dependent on your actual data being similar in structure to your example data.
Your data:
# The example data from the question: every id appears twice.
df1 <- data.frame(
  id = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L),
  colA = c(NA, "Y", "Z", "Z", "Z", "Z", NA, NA),
  colB = c("X", "X", NA, "Y", "Y", "Y", NA, NA),
  stringsAsFactors = FALSE
)
Assuming, as in your example, that each id occurs twice and that where one observation is NA, it is the first observation for that id, then this works:
library(dplyr)
library(tidyr)

# Within each id, copy the next non-missing value upwards into the NAs.
filled <- df1 %>%
  group_by(id) %>%
  fill(colA, colB, .direction = "up") %>%
  ungroup()

# Rows that have become identical are collapsed into one.
distinct(filled)
# A tibble: 4 x 3
id colA colB
<int> <chr> <chr>
1 1 Y X
2 2 Z Y
3 3 Z Y
4 4 NA NA
If the second observation for an id can be NA, you could try adding a second fill after the first one, but this time fill down:
# "updown" fills upwards first and then downwards, so an NA is replaced
# whether its non-missing partner row sits below or above it.
filled <- df1 %>%
  group_by(id) %>%
  fill(colA, colB, .direction = "updown") %>%
  ungroup()

# Rows that have become identical are collapsed into one.
distinct(filled)
Creating dataframe - it helps if you post the code to make the sample data
# Sample data: every id appears twice; some rows miss colA and/or colB.
df <- data.frame(
  id = rep(1:4, each = 2),
  colA = c(NA, "y", "z", "z", "z", "z", NA, NA),
  colB = c("x", "x", NA, "y", "y", "y", NA, NA)
)
Removing rows with single NAs
# Drop every row in which exactly one of colA / colB is missing.
# The mask is computed for all rows at once: deleting rows inside a loop
# over 1:nrow(df) shifts the remaining rows up, so the row that follows
# each deleted row would never be checked.
one_missing <- xor(is.na(df$colA), is.na(df$colB))
df <- df[!one_missing, ]
Removing remaining duplicates (i.e. double NA rows)
df <- df[!duplicated(df), ]
Output
df
There is probably a more computationally efficient way of doing this, but this ought to work.

Resources