Find character in csv, split cells in R - r

I have a .csv file with gene names such as "AT1G45150". However, some entries have two gene names connected by an underscore, so they look like this "AT3G01311_ATCG00940" as seen in line 135. Is there a simple command, perhaps with something like gsub that not only finds and eliminates everything in the cell from the underscore on, but also sticks the second gene name in a cell immediately below the one it was found in, in the same column but the next row down? Also want to keep everything that was already in that column, just extend column length to add new members.
"133","AT1G45150","AT1G12200","AT2G25370","AT1G19715","AT2G46830","AT1G20870","AT4G12400","AT1G19660"
"134","AT1G47280","AT1G12410","AT2G26920","AT1G19750","AT2G46850","AT1G21400","AT4G15430","AT1G19690"
"135","AT1G47317","AT1G12530","AT2G27270","AT1G20540","AT3G01311_ATCG00940","AT1G21450","AT5G01970","AT1G19750"
"136","AT1G47420","AT1G12550","AT2G28590","AT1G20570","AT3G03470","AT1G21730","AT1G20800","AT1G19780"
"137","AT1G47500","AT1G12740","AT2G28970","AT1G20580","AT3G03980","AT1G21760","AT3G54740","AT1G19790"
"138","AT1G47570","AT1G12750","AT2G29740","AT1G20610","AT3G05040","AT1G22000","AT4G12400","AT1G19970"
so that it becomes
"133","AT1G45150","AT1G12200","AT2G25370","AT1G19715","AT2G46830","AT1G20870","AT4G12400","AT1G19660"
"134","AT1G47280","AT1G12410","AT2G26920","AT1G19750","AT2G46850","AT1G21400","AT4G15430","AT1G19690"
"135","AT1G47317","AT1G12530","AT2G27270","AT1G20540","AT3G01311","AT1G21450","AT5G01970","AT1G19750"
"136","AT1G47420","AT1G12550","AT2G28590","AT1G20570","ATCG000940","AT1G21730","AT1G20800","AT1G19780"
"137","AT1G47500","AT1G12740","AT2G28970","AT1G20580","AT3G03470","AT1G21760","AT3G54740","AT1G19790"
"138","AT1G47570","AT1G12750","AT2G29740","AT1G20610","AT3G03980","AT1G22000","AT4G12400","AT1G19970"
Thanks for your help!
edit: trying to provide a reproducible example, hope this is helpful:
> dput(droplevels(genes[133:138,]))
structure(list(g99 = structure(1:6, .Label = c("AT1G45150", "AT1G47280",
"AT1G47317", "AT1G47420", "AT1G47500", "AT1G47570"), class = "factor"),
g95 = structure(1:6, .Label = c("AT1G12200", "AT1G12410",
"AT1G12530", "AT1G12550", "AT1G12740", "AT1G12750"), class = "factor"),
y99 = structure(1:6, .Label = c("AT2G25370", "AT2G26920",
"AT2G27270", "AT2G28590", "AT2G28970", "AT2G29740"), class = "factor"),
y95 = structure(1:6, .Label = c("AT1G19715", "AT1G19750",
"AT1G20540", "AT1G20570", "AT1G20580", "AT1G20610"), class = "factor"),
a99 = structure(1:6, .Label = c("AT2G46830", "AT2G46850",
"AT3G01311_ATCG00940", "AT3G03470", "AT3G03980", "AT3G05040"
), class = "factor"), a95 = structure(1:6, .Label = c("AT1G20870",
"AT1G21400", "AT1G21450", "AT1G21730", "AT1G21760", "AT1G22000"
), class = "factor"), e99 = structure(c(3L, 4L, 5L, 1L, 2L,
3L), .Label = c("AT1G20800", "AT3G54740", "AT4G12400", "AT4G15430",
"AT5G01970"), class = "factor"), e95 = structure(1:6, .Label = c("AT1G19660",
"AT1G19690", "AT1G19750", "AT1G19780", "AT1G19790", "AT1G19970"
), class = "factor")), .Names = c("g99", "g95", "y99", "y95",
"a99", "a95", "e99", "e95"), row.names = 133:138, class = "data.frame")

I'm assuming that these genes are part of a bigger data frame with more information about each gene. I'd use tidyr and dplyr. Something like this should work:
library(dplyr)
library(tidyr)
# Turn compound "A_B" gene identifiers into one row per gene.
# Assumes `df` has a character column named `gene`.
df <-
  df %>%
  # Split on the underscore into two columns. fill = 'right' pads single-gene
  # rows with NA in `second` explicitly, so no "missing pieces" warning is
  # raised for every row that has only one gene.
  separate(gene, c('first', 'second'), sep = '_', fill = 'right') %>%
  # Stack both columns back into a single `gene` column (long format).
  gather(position, gene, first, second) %>%
  # Drop the placeholder rows created for genes that had no second name.
  filter(!is.na(gene))
I used separate to split the column into two, with the first column containing the first gene and the second column with the second (if it exists). Then I used gather to stack all the genes on top of each other and filter to remove rows from the missing second gene.
Hope this helps!

Now that I've seen your data I've got a new answer. I'm a little confused about what exactly you want in the dataframe, but here's how to do it for a single vector.
library(stringr)
> df$a99
[1] "AT2G46830" "AT2G46850" "AT3G01311_ATCG00940"
[4] "AT3G03470" "AT3G03980" "AT3G05040"
> unlist(str_split(df$a99, '_'))
[1] "AT2G46830" "AT2G46850" "AT3G01311" "ATCG00940" "AT3G03470" "AT3G03980"
[7] "AT3G05040"

This answer assumes you maybe want to keep the data frame structure.
First load the following three packages:
library(stringr); library(purrr); library(dplyr)
Then your data frame looks like:
> genes
V1 V2 V3 V4 V5 V6 V7 V8 V9
1 133 AT1G45150 AT1G12200 AT2G25370 AT1G19715 AT2G46830 AT1G20870 AT4G12400 AT1G19660
2 134 AT1G47280 AT1G12410 AT2G26920 AT1G19750 AT2G46850 AT1G21400 AT4G15430 AT1G19690
3 135 AT1G47317 AT1G12530 AT2G27270 AT1G20540 AT3G01311_ATCG00940 AT1G21450 AT5G01970 AT1G19750
4 136 AT1G47420 AT1G12550 AT2G28590 AT1G20570 AT3G03470 AT1G21730 AT1G20800 AT1G19780
5 137 AT1G47500 AT1G12740 AT2G28970 AT1G20580 AT3G03980 AT1G21760 AT3G54740 AT1G19790
6 138 AT1G47570 AT1G12750 AT2G29740 AT1G20610 AT3G05040 AT1G22000 AT4G12400 AT1G19970
If I was just to attack the V6 variable, I would use the following commands from stringr:
> str_sub(genes$V6, start = 1L,
end = ifelse(is.na(str_locate(genes$V6, '_')[,1]), -1,
str_locate(genes$V6, '_')[, 1] - 1))
[1] "AT2G46830" "AT2G46850" "AT3G01311" "AT3G03470" "AT3G03980" "AT3G05040"
But we want to generalize this to all the variables, in case you want to keep your data frame structure. So use the map function from purrr to go through all the columns in the data frame (you also might be able to use lapply in a similar manner, but sometimes it's hard to coerce to a dataframe).
> genes2 <- map(genes, function(x) {
    # Position of the first underscore in each cell (NA when there is none).
    underscore_at <- str_locate(x, '_')[, 1]
    # Keep everything before the underscore; end = -1 keeps the whole string.
    str_sub(x, start = 1L,
            end = ifelse(is.na(underscore_at), -1, underscore_at - 1))
  }) %>%  # the pipe must end this line: a line starting with %>% is a parse error
  as_tibble()  # as_data_frame() is deprecated in favour of as_tibble()
And your data frame then looks like this:
> genes2
Source: local data frame [6 x 9]
V1 V2 V3 V4 V5 V6 V7 V8 V9
(chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr)
1 133 AT1G45150 AT1G12200 AT2G25370 AT1G19715 AT2G46830 AT1G20870 AT4G12400 AT1G19660
2 134 AT1G47280 AT1G12410 AT2G26920 AT1G19750 AT2G46850 AT1G21400 AT4G15430 AT1G19690
3 135 AT1G47317 AT1G12530 AT2G27270 AT1G20540 AT3G01311 AT1G21450 AT5G01970 AT1G19750
4 136 AT1G47420 AT1G12550 AT2G28590 AT1G20570 AT3G03470 AT1G21730 AT1G20800 AT1G19780
5 137 AT1G47500 AT1G12740 AT2G28970 AT1G20580 AT3G03980 AT1G21760 AT3G54740 AT1G19790
6 138 AT1G47570 AT1G12750 AT2G29740 AT1G20610 AT3G05040 AT1G22000 AT4G12400 AT1G19970

Related

Keep only rows if number is greater than... in specific column

This is an example of data:
exp_data <- structure(list(Seq = c("AAAARVDS", "AAAARVDSSSAL",
"AAAARVDSRASDQ"), Change = structure(c(19L, 20L, 13L), .Label = c("",
"C[+58]", "C[+58], F[+1152]", "C[+58], F[+1152], L[+12], M[+12]",
"C[+58], L[+2909]", "L[+12]", "L[+370]", "L[+504]", "M[+12]",
"M[+1283]", "M[+1457]", "M[+1491]", "M[+16]", "M[+16], Y[+1013]",
"M[+16], Y[+1152]", "M[+16], Y[+762]", "M[+371]", "M[+386], Y[+12]",
"M[+486], W[+12]", "Y[+12]", "Y[+1240]", "Y[+1502]", "Y[+1988]",
"Y[+2918]"), class = "factor"), `Mass` = c(1869.943,
1048.459, 707.346), Size = structure(c(2L, 2L, 2L), .Label = c("Matt",
"Greg",
"Kieran"
), class = "factor"), `Number` = c(2L, 2L, 2L)), row.names = c(244L,
392L, 396L), class = "data.frame")
I would like to draw your attention to the column named Change, as this is the one I would like to use for filtering. We have three rows here and I would like to keep only the first one, because it contains a change bigger than 100 for a specific letter. I would like to keep all of the rows which contain a change greater than +100 for some letter. There may be up to 4-5 letters in the Change column, but if at least one of them has a modification of at least +100 I would like to keep the row.
Do you have any simple solution for that ?
Expected output:
Seq Change Mass Size Number
244 AAAARVDS M[+486], W[+12] 1869.943 Greg 2
Not entirely sure I understood your problem statement correctly, but perhaps something like this
library(dplyr)
library(stringr)
exp_data %>% filter(str_detect(Change, "\\d{3}"))
# Seq Change Mass Size Number
#1 AAAARVDS M[+486], W[+12] 1869.943 Greg 2
Or the same in base R
exp_data[grep("\\d{3}", exp_data$Change), ]
# Seq Change Mass Size Number
#244 AAAARVDS M[+486], W[+12] 1869.943 Greg 2
The idea is to use a regular expression to keep only those rows where Change contains at least one three-digit expression.
You can use str_extract_all from the stringr package
library(stringr)
data.table solution
library(data.table)
setDT(exp_data)
exp_data[, max := max(as.numeric(str_extract_all(Change, "[[:digit:]]+")[[1]])), by = Seq]
exp_data[max > 100, ]
Seq Change Mass Size Number max
1: AAAARVDS M[+486], W[+12] 1869.9 Greg 2 486
dplyr solution
library(dplyr)
exp_data %>%
group_by(Seq) %>%
filter(max(as.numeric(str_extract_all(Change, "[[:digit:]]+")[[1]])) > 100)
# A tibble: 1 x 5
# Groups: Seq [1]
Seq Change Mass Size Number
<chr> <fct> <dbl> <fct> <int>
1 AAAARVDS M[+486], W[+12] 1870. Greg 2

Merging two columns into one based on value

I have a dataset with two columns containing the following: an indicator number and a hashcode
The only problem is that the columns have the same name, but the value can switch columns.
Now I want to merge the columns and keep the number (I don't care about the hashcode)
I saw this question: Merge two columns into one in r
and I tried the coalesce() function, but that only works when there are NA values, which I don't have. I looked at the unite function, but according to the cheat sheet documentation here it doesn't do what I'm looking for.
My next try was the filter_at and other filter functions from the dplyr package Documentation here
But that only leaves 150 data points while at the start I have 61k data points.
Code of filter_at I tried:
data <- filter_at(data,vars("hk","hk_1"),all_vars(.>0))
I assumed that a #-string shall not be greater than 0, which seems to be true, but it removes more than intended.
I would like to keep hk or hk_1 value which is a number. The other one (the hash) can be removed. Then I want a new column which only contains those numbers.
Sample data
My data looks like this:
HK|HK1
190|#SP0839
190|#SP0340
178|#SP2949
#SP8390|177
#SP2240|212
What I would like to see:
HK
190
190
178
177
212
I hope this provides an insight into the data. There are more columns like description, etc which makes that 190 at the start are not doubles.
We can replace all the values that start with "#" to NA and then use coalesce to select non-NA value between HK and HK1.
library(dplyr)
df %>%
mutate_all(~as.character(replace(., grepl("^#", .), NA))) %>%
mutate(HK = coalesce(HK, HK1)) %>%
select(HK)
# HK
#1 190
#2 190
#3 178
#4 177
#5 212
data
df <- structure(list(HK = structure(c(4L, 4L, 3L, 2L, 1L), .Label = c("#SP2240",
"#SP8390", "178", "190"), class = "factor"), HK1 = structure(c(2L,
1L, 3L, 4L, 5L), .Label = c("#SP0340", "#SP0839", "#SP2949",
"177", "212"), class = "factor")), class = "data.frame", row.names = c(NA, -5L))

Comparing pairs of rows in a list of data frames

I have a list that's 1314 elements long. Each element is a data frame consisting of two rows and four columns.
Game.ID Team Points Victory
1 201210300CLE CLE 94 0
2 201210300CLE WAS 84 0
I would like to use the lapply function to compare points for each team in each game, and change Victory to 1 for the winning team.
I'm trying to use this function:
test_vic <- lapply(all_games, function(x) {if (x[1,3] > x[2,3]) {x[1,4] = 1}})
But the result it produces is a list 1314 elements long with just the Game ID and either a 1 or a null, a la:
$`201306200MIA`
[1] 1
$`201306160SAS`
NULL
How can I fix my code so that each data frame maintains its shape. (I'm guessing solving the null part involves if-else, but I need to figure out the right syntax.)
Thanks.
Try
lapply(all_games, function(x) {x$Victory[which.max(x$Points)] <- 1; x})
Or another option would be to convert the list to data.table by using rbindlist and then do the conversion
library(data.table)
rbindlist(all_games)[,Victory:= +(Points==max(Points)) ,Game.ID][]
data
all_games <- list(structure(list(Game.ID = c("201210300CLE",
"201210300CLE"
), Team = c("CLE", "WAS"), Points = c(94L, 84L), Victory = c(0L,
0L)), .Names = c("Game.ID", "Team", "Points", "Victory"),
class = "data.frame", row.names = c("1",
"2")), structure(list(Game.ID = c("201210300CME", "201210300CME"
), Team = c("CLE", "WAS"), Points = c(90, 92), Victory = c(0L,
0L)), .Names = c("Game.ID", "Team", "Points", "Victory"),
row.names = c("1", "2"), class = "data.frame"))
You could try dplyr:
library(dplyr)
all_games %>%
bind_rows() %>%
group_by(Game.ID) %>%
mutate(Victory = row_number(Points)-1)
Which gives:
#Source: local data frame [4 x 4]
#Groups: Game.ID
#
# Game.ID Team Points Victory
#1 201210300CLE CLE 94 1
#2 201210300CLE WAS 84 0
#3 201210300CME CLE 90 0
#4 201210300CME WAS 92 1

How do I plot boxplots of two different series?

I have 2 dataframe sharing the same rows IDs but with different columns
Here is an example
chrom coord sID CM0016 CM0017 CM0018
7 10 3178881 SP_SA036,SP_SA040 0.000000000 0.000000000 0.0009923
8 10 38894616 SP_SA036,SP_SA040 0.000434783 0.000467464 0.0000970
9 11 104972190 SP_SA036,SP_SA040 0.497802888 0.529319536 0.5479003
and
chrom coord sID CM0001 CM0002 CM0003
4 10 3178881 SP_SA036,SA040 0.526806527 0.544927536 0.565610860
5 10 38894616 SP_SA036,SA040 0.009049774 0.002849003 0.002857143
6 11 104972190 SP_SA036,SA040 0.451612903 0.401617251 0.435318275
I am trying to create a composite boxplot figure where I have in x axis the chrom and coord combined (so 3 points) and for each x value 2 boxplots side by side corresponding to the two dataframes ?
What is the best way of doing this ? Should I merge the two dataframes together somehow in order to get only one and loop over the boxplots rendering by 3 columns ?
Any idea on how this can be done ?
The problem is that the two dataframes have the same number of rows but can differ in number of columns
> dim(A)
[1] 99 20
> dim(B)
[1] 99 28
I was thinking about transposing the dataframes in order to get the same number of columns, but got lost on how to do this properly
Thanks in advance
UPDATE
This is what I tried to do
I merged chrom and coord columns together to create a single ID
I used reshape to melt the dataframes
I merged the 2 melted dataframe into a single one
the head looks like this
I have two variable A2 and A4 corresponding to the 2 dataframes
Then I created a boxplot using this:
ggplot(A2A4, aes(factor(combine), value)) +geom_boxplot(aes(fill = factor(variable)))
I think it solved my problem but the boxplot looks very busy with 99 x values with 2 boxplots each
So if these are your input tables
d1<-structure(list(chrom = c(10L, 10L, 11L),
coord = c(3178881L, 38894616L, 104972190L),
sID = structure(c(1L, 1L, 1L), .Label = "SP_SA036,SP_SA040", class = "factor"),
CM0016 = c(0, 0.000434783, 0.497802888), CM0017 = c(0, 0.000467464,
0.529319536), CM0018 = c(0.0009923, 9.7e-05, 0.5479003)), .Names = c("chrom",
"coord", "sID", "CM0016", "CM0017", "CM0018"), class = "data.frame", row.names = c("7",
"8", "9"))
d2<-structure(list(chrom = c(10L, 10L, 11L), coord = c(3178881L,
38894616L, 104972190L), sID = structure(c(1L, 1L, 1L), .Label = "SP_SA036,SA040", class = "factor"),
CM0001 = c(0.526806527, 0.009049774, 0.451612903), CM0002 = c(0.544927536,
0.002849003, 0.401617251), CM0003 = c(0.56561086, 0.002857143,
0.435318275)), .Names = c("chrom", "coord", "sID", "CM0001",
"CM0002", "CM0003"), class = "data.frame", row.names = c("4",
"5", "6"))
Then I would combine and reshape the data to make it easier to plot. Here's what i'd do
# NOTE: melt() is not base R; presumably reshape2 is intended here, so load it
# first with library(reshape2).
# Turn the per-sample columns of each table into rows (long format).
m1 <- melt(d1, id.vars = c("chrom", "coord", "sID"))
m2 <- melt(d2, id.vars = c("chrom", "coord", "sID"))
# Tag every row with the table it came from, then stack the two tables.
# The result is named `mm` because the factor below and the ggplot() call
# that follows both refer to `mm`.
mm <- rbind(cbind(m1, s = "T1"), cbind(m2, s = "T2"))
# Build a "chrom:coord" factor whose levels follow numeric chrom/coord order
# (a plain factor() would sort the pasted labels alphabetically instead).
mm$pos <- factor(paste(mm$chrom, mm$coord, sep = ":"),
  levels = do.call(paste, c(unique(mm[order(mm[[1]], mm[[2]]), 1:2]), sep = ":")))
I first melt the two input tables to turn columns into rows. Then I add a column to each table so I know where the data came from and rbind them together. And finally I do a bit of messy work to make a factor out of the chr/coord pairs sorted in the correct order.
With all that done, I'll make the plot like
ggplot(mm, aes(x=pos, y=value, color=s)) +
geom_boxplot(position="dodge")
and it looks like

Calculating subtotals (sum, stdev, average etc)

I have been searching for this for a while, but haven't been able to find a clear answer so far. Probably have been looking for the wrong terms, but maybe somebody here can quickly help me. The question is kind of basic.
Sample data set:
set <- structure(list(VarName = structure(c(1L, 5L, 4L, 2L, 3L),
.Label = c("Apple/Blue/Nice",
"Apple/Blue/Ugly", "Apple/Pink/Ugly", "Kiwi/Blue/Ugly", "Pear/Blue/Ugly"
), class = "factor"), Color = structure(c(1L, 1L, 1L, 1L, 2L), .Label = c("Blue",
"Pink"), class = "factor"), Qty = c(45L, 34L, 46L, 21L, 38L)), .Names = c("VarName",
"Color", "Qty"), class = "data.frame", row.names = c(NA, -5L))
This gives a data set like:
set
VarName Color Qty
1 Apple/Blue/Nice Blue 45
2 Pear/Blue/Ugly Blue 34
3 Kiwi/Blue/Ugly Blue 46
4 Apple/Blue/Ugly Blue 21
5 Apple/Pink/Ugly Pink 38
What I would like to do is fairly straight forward. I would like to sum (or averages or stdev) the Qty column. But, also I would like to do the same operation under the following conditions:
VarName includes "Apple"
VarName includes "Ugly"
Color equals "Blue"
Anybody that can give me a quick introduction on how to perform this kind of calculations?
I am aware that some of it can be done by the aggregate() function, e.g.:
aggregate(set[3], FUN=sum, by=set[2])[1,2]
However, I believe that there is a more straightforward way of doing this than this. Are there some filters that can be added to functions like sum()?
The easiest way is to split up your VarName column; then subsetting becomes very easy. So, let's create an object where VarName has been separated:
##There must(?) be a better way than this. Anyone?
new_set = t(as.data.frame(sapply(as.character(set$VarName), strsplit, "/")))
Brief explanation:
We use as.character because set$VarName is a factor
sapply takes each value in turn and applies strsplit
The strsplit function splits up the elements
We convert to a data frame
Transpose to get the correct rotation
Next,
##Convert to a data frame
new_set = as.data.frame(new_set)
##Make nice rownames - not actually needed
rownames(new_set) = 1:nrow(new_set)
##Add in the Qty column
new_set$Qty = set$Qty
This gives
R> new_set
V1 V2 V3 Qty
1 Apple Blue Nice 45
2 Pear Blue Ugly 34
3 Kiwi Blue Ugly 46
4 Apple Blue Ugly 21
5 Apple Pink Ugly 38
Now all the operations are as standard. For example,
##Add up all blue Qtys
sum(new_set[new_set$V2 == "Blue",]$Qty)
[1] 146
##Average of Blue and Ugly Qtys
mean(new_set[new_set$V2 == "Blue" & new_set$V3 == "Ugly",]$Qty)
[1] 33.67
Once it's in the correct form, you can use ddply which does everything you want (and more)
library(plyr)
##Split the data frame up by V1 and take the mean of Qty
ddply(new_set, .(V1), summarise, m = mean(Qty))
##Split the data frame up by V1 & V2 and take the mean of Qty
ddply(new_set, .(V1, V2), summarise, m = mean(Qty))
Is this what you're looking for?
# sum for those including 'Apple'
apple <- set[grep('Apple', set[, 'VarName']), ]
aggregate(apple[3], FUN=sum, by=apple[2])
Color Qty
1 Blue 66
2 Pink 38
# sum for those including 'Ugly'
ugly <- set[grep('Ugly', set[, 'VarName']), ]
aggregate(ugly[3], FUN=sum, by=ugly[2])
Color Qty
1 Blue 101
2 Pink 38
# sum for Color==Blue
sum(set[set[, 'Color']=='Blue', 3])
[1] 146
The last sum could be done by using subset
sum(subset(set, Color=='Blue')[,3])

Resources