Extract only rows from a specific dataset in R? - r

I have a table which looks like this:
# Example annotation table: one row per feature. Every column is stored as
# character, including the numeric-looking `start` and `end`.
df1 <- data.frame(
  seqid = c("12", "12", "13", "12", "12", "15"),
  source = c("star", "star", "star", "star", "star", "star"),
  type = c("CDS", "CDS", "CDS", "intron", "CDS", "intron"),
  start = c("15", "21", "23", "35", "45", "60"),
  end = c("70", "80", "86", "45", "67", "88"),
  attributes = c(
    "ENSOCUT00000011013", "ENSOCUT00000064484", "ENSOCUT00000013302",
    "ENSOCUT00000010968", "ENSOCUT00000010968", "ENSOCUT00000060283"
  ),
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE,
  check.names = FALSE
)
seqid
source
Type
start
end
attributes
12
star
CDS
15
70
ENSOCUT00000011013
12
star
CDS
21
80
ENSOCUT00000064484
12
star
CDS
23
86
ENSOCUT00000013302
12
star
intron
35
45
ENSOCUT00000010968
12
star
CDS
45
67
ENSOCUT00000010968
12
star
intron
60
88
ENSOCUT00000060283
And I want to extract only rows 1, 2, 3, 5 to have a final result looking like this:
seqid
source
Type
start
end
attributes
12
star
CDS
15
70
ENSOCUT00000011013
12
star
CDS
21
80
ENSOCUT00000064484
12
star
CDS
23
86
ENSOCUT00000013302
12
star
CDS
45
67
ENSOCUT00000010968

# Pick rows 1, 2, 3 and 5; the empty slot after the comma keeps every column.
df1[c(1L, 2L, 3L, 5L), ]
In general to select numbered rows/columns in the brackets of a data.frame df:
df[rows_selected_go_here, columns_selected_go_here]

I assume that you only want those entries from df where Type (string) equals CDS
library(tidyverse)
# Keep only the rows whose `type` is exactly "CDS" (matched case-insensitively).
# The sample data frame is called `df1` and its column is lower-case `type`;
# `$Type` would return NULL there and no row would be selected.
is_cds <- grepl("^CDS$", df1$type, ignore.case = TRUE)
mynewdf <- df1[is_cds, ]

Related

Convert long data to wide data with multiple columns and multiple values

I have a data frame of 300K rows with 2 main columns of interest (NAME & SUBJECT). I need to convert this data into a wide format and, in addition, if I get records for a particular subject with multiple dates, I need to place them next to each other.
I tried using tidyr::pivot_wider but I'm not able to get it work.
Sample data:
# Sample marks data: 17 records; all five columns are entered as strings.
DF <- data.frame(
  NAME = c(
    "ABC", "ABC", "DEF", "ABC", "ABC", "ABC", "DEF", "ABC", "DEF",
    "ABC", "DEF", "DEF", "DEF", "DEF", "DEF", "DEF", "ABC"
  ),
  SUBJECT = c(
    "MATHS", "LANGUAGE 1", "LANGUAGE 1",
    "LANGUAGE 2", "LANGUAGE 2", "LANGUAGE 2", "LANGUAGE 2",
    "SCIENCE", "SCIENCE", "HISTORY", "PE", "ENVIRONMENT",
    "COMPUTERS", "COMPUTERS", "COMPUTERS", "BIOLOGY", "SANSKRIT"
  ),
  YEAR = c(
    "2010", "2011", "2012", "2013", "2014", "2015", "2013", "2015", "2016",
    "2016", "2017", "2015", "2016", "2017", "2018", "2015", "2013"
  ),
  MARKS = c(
    "45", "48", "47", "44", "48", "46", "42", "42", "43",
    "37", "42", "43", "42", "41", "44", "41", "44"
  ),
  # One "46", five "50", one "45", one "50", nine "45"
  MAXIMUM = rep(c("46", "50", "45", "50", "45"), times = c(1, 5, 1, 1, 9))
)
> DF
NAME SUBJECT YEAR MARKS MAXIMUM
1 ABC MATHS 2010 45 46
2 ABC LANGUAGE 1 2011 48 50
3 DEF LANGUAGE 1 2012 47 50
4 ABC LANGUAGE 2 2013 44 50
5 ABC LANGUAGE 2 2014 48 50
6 ABC LANGUAGE 2 2015 46 50
7 DEF LANGUAGE 2 2013 42 45
8 ABC SCIENCE 2015 42 50
9 DEF SCIENCE 2016 43 45
10 ABC HISTORY 2016 37 45
11 DEF PE 2017 42 45
12 DEF ENVIRONMENT 2015 43 45
13 DEF COMPUTERS 2016 42 45
14 DEF COMPUTERS 2017 41 45
15 DEF COMPUTERS 2018 44 45
16 DEF BIOLOGY 2015 41 45
17 ABC SANSKRIT 2013 44 45
My expected output is like this: (It is a bit long)
Bit tricky with pivoting twice, but here you go:
library(tidyverse)
# 1. Number the repeated records within each NAME/SUBJECT pair.
# 2. Stack YEAR, MARKS and MAXIMUM into name/value pairs.
# 3. Tag each name with its record number and spread everything wide again.
DF %>%
  group_by(NAME, SUBJECT) %>%
  mutate(ind = row_number()) %>%
  ungroup() %>%
  pivot_longer(
    cols = c("YEAR", "MARKS", "MAXIMUM"),
    names_to = "name",
    values_to = "value"
  ) %>%
  # `ind = NULL` drops the counter once it has been pasted onto the name
  mutate(name = paste0(name, ind), ind = NULL) %>%
  pivot_wider(names_from = c("SUBJECT", "name"), values_from = "value")

How to sort the row order according to number not character?

I want to sort the row order of the data frame according to number, not character. My row indices for my data frame are numeric with an order of 1,10,11,12,2,20,21,22, etc. I have used order() trying to sort my row indices to 1,2,3,4,5,6,7,8,9,10, etc, but my row indices just stayed the same.
So my data frame has 1 column with 11 rows:
# Bind the example data to `df`, the name the code that follows reads.
# The row names are character strings, which is why they appear in
# "1", "10", "11", ... order.
df <- structure(
  list(
    `colSums(fake_with_noise_boundary)` = c(
      -3405, 2304, -4096, 474, -2089, -3921, -2590, 1605, 1317, 2804, 2934
    )
  ),
  row.names = c(
    "1", "10", "11", "12", "2", "20", "21", "3", "30", "31", "40"
  ),
  class = "data.frame"
)
Row names are always stored as characters. If you want to sort them according to their numeric value, you can convert them to numeric and order by that.
# Convert the character row names to numbers, compute the sorting permutation,
# and reorder the rows; drop = FALSE keeps the one-column result a data frame.
row_order <- order(as.numeric(rownames(df)))
df <- df[row_order, , drop = FALSE]
df
# colSums(fake_with_noise_boundary)
#1 -3405
#2 -2089
#3 1605
#10 2304
#11 -4096
#12 474
#20 -3921
#21 -2590
#30 1317
#31 2804
#40 2934
library(tidyverse)
# One-column example data frame whose row names are character strings.
df <- structure(
  list(
    `colSums(fake_with_noise_boundary)` = c(
      -3405, 2304, -4096, 474, -2089, -3921, -2590, 1605, 1317, 2804, 2934
    )
  ),
  row.names = c(
    "1", "10", "11", "12", "2", "20", "21", "3", "30", "31", "40"
  ),
  class = "data.frame"
)
df %>%
  # Move the row names into a regular column called "rowname"
  rownames_to_column(var = "rowname") %>%
  # Turn that column into numbers so it sorts numerically, not alphabetically
  mutate(rowname = as.numeric(rowname)) %>%
  # Reorder the rows by the numeric row name
  arrange(rowname)
rowname colSums(fake_with_noise_boundary)
1 1 -3405
2 2 -2089
3 3 1605
4 10 2304
5 11 -4096
6 12 474
7 20 -3921
8 21 -2590
9 30 1317
10 31 2804
11 40 2934

Keep certain type on column and only one type on column in R?

I have a table which looks like this:
# Example annotation table: one row per feature. Every column is stored as
# character, including the numeric-looking `start` and `end`.
df1 <- data.frame(
  seqid = c("12", "12", "13", "12", "12", "15"),
  source = c("star", "star", "star", "star", "star", "star"),
  type = c("CDS", "CDS", "CDS", "intron", "CDS", "intron"),
  start = c("15", "21", "23", "35", "45", "60"),
  end = c("70", "80", "86", "45", "67", "88"),
  attributes = c(
    "ENSOCUT00000011013", "ENSOCUT00000064484", "ENSOCUT00000013302",
    "ENSOCUT00000010968", "ENSOCUT00000010968", "ENSOCUT00000060283"
  ),
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE,
  check.names = FALSE
)
seqid
source
Type
start
end
attributes
12
star
CDS
15
70
ENSOCUT00000011013
12
star
CDS
21
80
ENSOCUT00000064484
12
star
CDS
23
86
ENSOCUT00000013302
12
star
intron
35
45
ENSOCUT00000010968
12
star
CDS
45
67
ENSOCUT00000010968
12
star
intron
60
88
ENSOCUT00000060283
I want my final result to look like this:
seqid
source
Type
start
end
attributes
12
star
CDS
15
70
ENSOCUT00000011013
12
star
CDS
21
80
ENSOCUT00000064484
12
star
CDS
23
86
ENSOCUT00000013302
12
star
CDS
45
67
ENSOCUT00000010968
So I want to group_by by only ENSOCUT00000011013, ENSOCUT00000064484, ENSOCUT00000013302, ENSOCUT00000010968 (on attributes column) and keep only the CDS of those on the type column

Easier way to add rows with totals for groups in dplyr

How could I add rows with the sum of VL_FOB_real for each CO_ANO-niv100-subsector group in an easier way? I couldn't figure out how to use add_row and the like to do so, only by creating a new dataframe and then appending it.
Here is what I have done:
# 15-row example table, built by hand with the tibble class vector.
df <- structure(
  list(
    CO_ANO = c(
      "1996", "1990", "1993", "1993", "1994",
      "1992", "1995", "1995", "1996", "1995",
      "1994", "1990", "1989", "1992", "1995"
    ),
    CO_UF = c(
      "32", "45", "45", "36", "55",
      "99", "36", "34", "14", "25",
      "53", "41", "41", "41", "16"
    ),
    niv100 = c(
      "2210", "1530", "210", "3210", "1530",
      "2610", "2210", "2630", "1030", "1020",
      "3020", "3020", "410", "2510", "1520"
    ),
    subsector = c(
      "11", "8", "1", "7", "8",
      "13", "11", "13", "4", "5",
      "13", "13", "2", "13", "8"
    ),
    VL_FOB_real = c(
      1, 2, 3, 1, 4,
      5, 5, 6, 7, 6,
      8, 9, 10, 11, 11
    )
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -15L)
)
# Sum VL_FOB_real within each CO_ANO / subsector / niv100 group and label the
# summary rows "Total" in CO_UF.
df1 <- df %>%
  group_by(CO_ANO, subsector, niv100) %>%
  summarise(VL_FOB_real = sum(VL_FOB_real)) %>%
  mutate(CO_UF = "Total")

# Stack the total rows on top of the original rows.
df <- bind_rows(df1, df)
This groups the rows and then modifies each group using adorn_totals.
library(dplyr)
library(janitor)
# Append a totals row to every group.
# NOTE(review): the question asks for totals per CO_ANO / niv100 / subsector,
# whereas this groups by CO_UF instead of subsector - confirm that is intended.
df %>%
  group_by(CO_ANO, CO_UF, niv100) %>%
  group_modify(function(group_rows, group_key) {
    adorn_totals(group_rows, where = "row")
  })
giving:
# A tibble: 30 x 5
# Groups: CO_ANO, CO_UF, niv100 [15]
CO_ANO CO_UF niv100 subsector VL_FOB_real
<chr> <chr> <chr> <chr> <dbl>
1 1989 41 410 2 10
2 1989 41 410 Total 10
3 1990 41 3020 13 9
4 1990 41 3020 Total 9
5 1990 45 1530 8 2
6 1990 45 1530 Total 2
7 1992 41 2510 13 11
8 1992 41 2510 Total 11
9 1992 99 2610 13 5
10 1992 99 2610 Total 5
# ... with 20 more rows
Another thing to try is the following which gives somewhat different output. It splits the input into groups and applies adorn_totals separately to each group giving a c("tabyl", "tbl_df", "tbl", "data.frame") object.
library(dplyr)
library(janitor)
library(purrr)
# Split into one data frame per group, add a totals row to each piece, and
# row-bind the pieces back together.
df %>%
  group_split(CO_ANO, subsector, niv100, CO_UF) %>%
  map_dfr(adorn_totals)
Honestly, I would do what you have done to add rows for each group, but for the purpose of demonstrating a way to use add_row, here's an answer:
library(dplyr)
library(purrr)
# For each CO_ANO / subsector / niv100 group, append one row that repeats the
# group's keys, sums VL_FOB_real and carries "Total" in CO_UF.
df %>%
  group_split(CO_ANO, subsector, niv100) %>%
  map_df(function(grp) {
    add_row(
      grp,
      CO_ANO = first(grp$CO_ANO),
      subsector = first(grp$subsector),
      niv100 = first(grp$niv100),
      VL_FOB_real = sum(grp$VL_FOB_real),
      CO_UF = "Total"
    )
  })
# CO_ANO CO_UF niv100 subsector VL_FOB_real
# <chr> <chr> <chr> <chr> <dbl>
# 1 1989 41 410 2 10
# 2 1989 Total 410 2 10
# 3 1990 41 3020 13 9
# 4 1990 Total 3020 13 9
# 5 1990 45 1530 8 2
# 6 1990 Total 1530 8 2
# 7 1992 41 2510 13 11
# 8 1992 Total 2510 13 11
# 9 1992 99 2610 13 5
#10 1992 Total 2610 13 5
# … with 20 more rows
The only benefit I see of this approach is you get "Total" row for each group immediately after the group unlike in bind_rows where you get all "Total" rows together.

How to match 2 dataframe columns and extract column values and column names?

I have a matrix called mymat. I have a vector called geno <- c("01","N1","11","1N","10"). I have another table called key.table. I want to match the key column in key.table with the key column in mymat. If any column value in a matching row is one of the geno elements, I want to extract that column name from mymat along with the matching geno element, paste them into a new column matched.extract in key.table in the corresponding row for each key, and get the result below.
# Genotype matrix: a `key` column followed by one column per sample.
mymat <- matrix(
  c(
    "chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118",
    "0N", "0N", "1N", "0N", "0N",
    "00", "00", "00", "11", "10",
    "00", "00", "1N", "0N", "00"
  ),
  nrow = 5L,
  ncol = 4L,
  dimnames = list(
    c("34", "35", "36", "37", "38"),
    c("key", "AMLM12001KP", "AMAS-11.3-Diagnostic", "AMLM12014N-R")
  )
)

# Lookup table pairing each key with its variantId.
key.table <- matrix(
  c(
    "chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118",
    "chr5:12122", "chr5:12123", "chr5:12123", "chr5:12125", "chr5:12127",
    "chr5:12129",
    "9920068", "9920069", "9920070", "9920071", "9920072",
    "9920073", "9920074", "9920075", "9920076", "9920077",
    "9920078"
  ),
  nrow = 11L,
  ncol = 2L,
  dimnames = list(
    c("34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44"),
    c("key", "variantId")
  )
)
result
key variantId matched.extract
34 "chr5:12111" "9920068" NA
35 "chr5:12111" "9920069" NA
36 "chr5:12113" "9920070" AMLM12001KP (1N),AMLM12014N-R (1N)
37 "chr5:12114" "9920071" AMAS-11.3-Diagnostic (11)
38 "chr5:12118" "9920072" AMAS-11.3-Diagnostic (10)
39 "chr5:12122" "9920073" NA
40 "chr5:12123" "9920074" NA
41 "chr5:12123" "9920075" NA
42 "chr5:12125" "9920076" NA
43 "chr5:12127" "9920077" NA
44 "chr5:12129" "9920078" NA
Using data.table, I would approach it like this:
library(data.table)
# Turn the 'key.table' matrix into a data.table; row names land in column 'rn'
kt <- as.data.table(key.table, keep.rownames = TRUE)

# Turn the 'mymat' matrix into a data.table and reshape it to long format:
# one row per (rn, key, sample column)
mm_long <- melt(
  as.data.table(mymat, keep.rownames = TRUE),
  id.vars = c("rn", "key")
)

# Build the "<column> (<genotype>)" label only for the wanted genotypes
# NOTE(review): the question's geno vector also lists "01" and "N1"; extend
# this set if those values can occur.
mm_long[value %in% c("1N", "11", "10"), val := paste0(variable, " (", value, ")")]

# Collapse the labels of each rn/key pair into one comma-separated string
mm <- mm_long[, .(val = paste(val[!is.na(val)], collapse = ",")), by = .(rn, key)]

# Pairs without any label end up as "", so turn those into NA
mm[val == "", val := NA]

# Update join: copy the labels into 'kt' for matching rn/key pairs
kt[mm, matched := val, on = c("rn", "key")]
which gives:
> kt
rn key variantId matched
1: 34 chr5:12111 9920068 NA
2: 35 chr5:12111 9920069 NA
3: 36 chr5:12113 9920070 AMLM12001KP (1N),AMLM12014N-R (1N)
4: 37 chr5:12114 9920071 AMAS-11.3-Diagnostic (11)
5: 38 chr5:12118 9920072 AMAS-11.3-Diagnostic (10)
6: 39 chr5:12122 9920073 NA
7: 40 chr5:12123 9920074 NA
8: 41 chr5:12123 9920075 NA
9: 42 chr5:12125 9920076 NA
10: 43 chr5:12127 9920077 NA
11: 44 chr5:12129 9920078 NA
Explanation:
kt <- as.data.table(key.table, keep.rownames=TRUE) will convert the matrix key.table to a data.table (which is an enhanced data.frame) and stores the rownames in the rn column.
mm <- melt(as.data.table(mymat, keep.rownames=TRUE), id=c("rn","key")) will convert the matrix mymat to a data.table, stores the rownames in the rn column and melts the data.table into long format.
the part [value %in% c("1N","11","10"), val := paste0(variable," (",value,")")] will paste the variable-values (which were the column names in mymat) together with the value-values, but only in the cases where value is 1N, 11 or 10.
the part [, .(val = paste(val[!is.na(val)], collapse = ",")), by = .(rn,key)] will paste the non-NA rows of val together by the rn & key variables.
the part [val=="", val:=NA] will transform the empty rows for val into NA-values
finally kt[mm, matched := val, on=c("rn","key")] updates the kt-data.table by reference with the val-values of the mm-data.table for the matching rn & key variables.
WARNING: When using data.table, it is better not to use key as a variable name as key is also a parameter in a data.table. See ?key for more info.
I'm not that familiar with the dplyr functions. You can try the base R merge function:
# Join the columns of `mymat` onto `key.table` by `key`; all.x = TRUE keeps
# the keys that have no match in `mymat`. TRUE is spelled out because `T` is
# an ordinary variable that can be reassigned.
mm <- merge(key.table, mymat, by = "key", all.x = TRUE)
mm
function to paste the column names with the tissue type:
# Label each genotype in `x` as "<name> (<genotype>)" using the matching entry
# of `y`; genotypes "00" and "0N" and missing values yield NA instead.
get.geno <- function(x, y) {
  keep <- !is.na(x) & !(x %in% c("00", "0N"))
  ifelse(keep, paste0(y, " (", x, ")"), NA)
}
# Run get.geno over every row of columns 3 to 5, passing those columns' names
# as labels, then transpose so the rows of `a` line up with the rows of `mm`.
geno_cols <- 3:5
a <- t(apply(mm[, geno_cols], 1, get.geno, colnames(mm)[geno_cols]))
final dataframe:
# Join the non-missing labels of each row into one comma-separated string
mm$result <- apply(a, 1, function(labels) {
  paste(labels[!is.na(labels)], collapse = ",")
})
# Show the table without columns 3 to 5
mm[, -(3:5)]
key variantId result
1 chr5:12111 9920068
2 chr5:12111 9920068
3 chr5:12111 9920069
4 chr5:12111 9920069
5 chr5:12113 9920070 AMLM12001KP (1N),AMLM12014N-R (1N)
6 chr5:12114 9920071 AMAS-11.3-Diagnostic (11)
7 chr5:12118 9920072 AMAS-11.3-Diagnostic (10)
8 chr5:12122 9920073
9 chr5:12123 9920074
10 chr5:12123 9920075
11 chr5:12125 9920076
12 chr5:12127 9920077
13 chr5:12129 9920078
Not exactly sure what you want, but it might be close to this:
library(reshape2)
# Genotype matrix: a `key` column followed by one column per sample.
mymat <- matrix(
  c(
    "chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118",
    "0N", "0N", "1N", "0N", "0N",
    "00", "00", "00", "11", "10",
    "00", "00", "1N", "0N", "00"
  ),
  nrow = 5L,
  ncol = 4L,
  dimnames = list(
    c("34", "35", "36", "37", "38"),
    c("key", "AMLM12001KP", "AMAS-11.3-Diagnostic", "AMLM12014N-R")
  )
)

# Lookup table pairing each key with its variantId.
key.table <- matrix(
  c(
    "chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118",
    "chr5:12122", "chr5:12123", "chr5:12123", "chr5:12125", "chr5:12127",
    "chr5:12129",
    "9920068", "9920069", "9920070", "9920071", "9920072",
    "9920073", "9920074", "9920075", "9920076", "9920077",
    "9920078"
  ),
  nrow = 11L,
  ncol = 2L,
  dimnames = list(
    c("34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44"),
    c("key", "variantId")
  )
)
# Work with data frames.
# check.names = FALSE keeps the original column names; by default
# data.frame() rewrites "AMAS-11.3-Diagnostic" to "AMAS.11.3.Diagnostic".
mmdf <- data.frame(mymat, check.names = FALSE)
ktdf <- data.frame(key.table)
# Attach the variantId to every genotype row that shares a key
tdf <- merge(mmdf, ktdf, by = "key")
# Long format: one row per key / variantId / sample column
mltdf <- melt(tdf, id.vars = c("key", "variantId"))
# Drop the "0N" and "00" genotypes
mltdf1 <- mltdf[mltdf$value != "0N" & mltdf$value != "00", ]
mltdf1
Yielding:
key variantId variable value
5 chr5:12113 9920070 AMLM12001KP 1N
13 chr5:12114 9920071 AMAS.11.3.Diagnostic 11
14 chr5:12118 9920072 AMAS.11.3.Diagnostic 10
19 chr5:12113 9920070 AMLM12014N.R 1N

Resources