Remove replicate number from end of string - r

I need to remove the trailing replicate number, together with the letter "R" that precedes it, from every value in the column strain and store the result in a new column, mutant, preferably using dplyr so I can pipe the results forward.
For example
print(df)
strain measurement
1 CK522R1 75
2 CN344attBR1 50
3 GL065R13 32
4 GL078R100 27
Desired Output
strain measurement mutant
1 CK522R1 75 CK522
2 CN344attBR1 50 CN344attB
3 GL065R13 32 GL065
4 GL078R100 27 GL078
Reproducible Data
structure(list(strain = structure(1:4, .Label = c("CK522R1",
"CN344attBR1", "GL065R13", "GL078R100"), class = "factor"), measurement = c(75,
50, 32, 27)), class = "data.frame", row.names = c(NA, -4L))

From d.b's comment:
library(dplyr)
# `mutant` is the strain name with the trailing replicate tag ("R" + digits)
# removed; `replicate` is the tag itself. regmatches() drops elements that do
# not match, which makes mutate() fail with a length error as soon as one
# strain has no replicate suffix, so the tag is extracted with sub() and
# strains without a suffix get NA instead.
df %>%
  mutate(
    mutant = sub("R\\d+$", "", strain),
    replicate = ifelse(
      grepl("R\\d+$", strain),
      sub("^.*(R\\d+)$", "\\1", strain),
      NA_character_
    )
  )

Related

Replace string by match to another data frame

I want to replace the strings in column ID of df2 with the column genus of df1 based on the matching string in column species in df1. Any tips appreciated, especially dplyr. Maybe left_join?
> df1
genus species
1 Orthobunyavirus Variola virus
2 Alphatorquevirus Torque teno virus 6
3 Yatapoxvirus Yaba-like disease virus
.
> df2
ID
1 Variola virus
2 Torque teno virus 6
3 Yaba-like disease virus
.
Desired output
ID
1 Orthobunyavirus
2 Alphatorquevirus
3 Yatapoxvirus
> dput(df1)
structure(list(genus = c("Orthobunyavirus", "Alphatorquevirus",
"Yatapoxvirus"), species = c("Variola virus", "Torque teno virus 6",
"Yaba-like disease virus")), class = "data.frame", row.names = c(NA,
-3L))
> dput(df2)
structure(list(ID = c("Variola virus", "Torque teno virus 6",
"Yaba-like disease virus")), class = "data.frame", row.names = c(NA,
-3L))
You could simply use match
df2$ID <- df1$genus[match(df2$ID, df1$species)]
df2
#> ID
#> 1 Orthobunyavirus
#> 2 Alphatorquevirus
#> 3 Yatapoxvirus
df2$ID <- df1$genus[match(df2$ID,df1$species)]
replaces it, removing your original df2 data
df3 <- data.frame(ID = df1$genus[match(df2$ID,df1$species)])
creates a third df with the results.

How to get sum of column from sqldf output in R?

I would like to sum a single column of data that was output from an sqldf function in R.
I have a .csv file that contains groupings of sites, each with a unique ID and an associated area. For example:
occurrenceID sarea
{0255531B-904F-4E2D-B81D-797A21165A2F} 0.30626786
{0255531B-904F-4E2D-B81D-797A21165A2F} 0.49235953
{0255531B-904F-4E2D-B81D-797A21165A2F} 0.03490536
{0255531B-904F-4E2D-B81D-797A21165A2F} 0.00001389
{175A4B1C-CA8C-49F6-9CD6-CED9187579DC} 0.0302389
{175A4B1C-CA8C-49F6-9CD6-CED9187579DC} 0.01360811
{1EC60400-0AD0-4DB5-B815-221C4123AE7F} 0.08412911
{1EC60400-0AD0-4DB5-B815-221C4123AE7F} 0.01852466
I used the code below in R to pull out the largest area from each grouping of unique IDs.
> MyData <- read.csv(file="sacandaga2.csv", header=TRUE, sep=",")
> sqldf("select max(sarea),occurrenceID from MyData group by occurrenceID")
This produced the following output:
max(sarea) occurrenceID
1 0.49235953 {0255531B-904F-4E2D-B81D-797A21165A2F}
2 0.03023890 {175A4B1C-CA8C-49F6-9CD6-CED9187579DC}
3 0.08412911 {1EC60400-0AD0-4DB5-B815-221C4123AE7F}
4 0.00548259 {2412E244-2E9A-4477-ACC6-1EB02503BE75}
5 0.00295924 {40450574-ABEB-48E3-9BE5-09B5AB65B465}
6 0.01403846 {473FB631-D398-46B7-8E85-E63540BDFF92}
7 0.00257519 {4BABDE22-E8E0-435E-B60D-0BB9A84E1489}
8 0.02158115 {5F616A33-B028-46B1-AD92-89EAC1660C41}
9 0.00191211 {70067496-25B6-4337-8C70-782143909EF9}
10 0.03049355 {7F858EBB-132E-483F-BA36-80CE889373F5}
11 0.03947298 {9A579565-57EC-4E46-95ED-79724FA6F2AB}
12 0.02464722 {A9010BA3-0FE1-40B1-96A7-21122261A003}
13 0.00136672 {AAD710BF-1539-4235-87F1-34B66CF90781}
14 0.01139146 {AB1286C3-DBE3-467B-99E1-AEEF88A1B5B2}
15 0.07954269 {BED0433A-7167-4184-A25F-B9DBD358AFFB}
16 0.08401067 {C4EF0F45-5BF7-4F7C-BED8-D6B2DB718CB2}
17 0.04289261 {C58AC2C6-BDBE-4FE5-BD51-D70BBDFB4DB5}
18 0.03151558 {D4230F9C-80E4-454A-9D5D-0E373C6DCD9A}
19 0.00403585 {DD76A03A-CFBF-41E9-A571-03DA707BEBDA}
20 0.00007336 {E20DE254-8A0F-40BE-90D2-D6B71880E2A8}
21 9.81847859 {F382D5A6-F385-426B-A543-F5DE13F94564}
22 0.00815881 {F9032905-074A-468F-B60E-26371CF480BB}
23 0.24717113 {F9E5DC3C-4602-4C80-B00B-2AF1D605A265}
Now I would like to sum all the values in the max(sarea) column. What is the best way to accomplish this?
Either do it in sqldf or R, or assign your existing result and do it in R:
# Keep the per-occurrenceID maxima from the original query in a data frame
grouped_sum <- sqldf("select max(sarea),occurrenceID from MyData group by occurrenceID")
# and sum the column in R; the backticks are needed because `max(sarea)`
# is not a syntactic R name
sum(grouped_sum$`max(sarea)`)
# Aliasing the column in SQL gives it a syntactic name, so no backticks
grouped_sum <- sqldf(
"select max(sarea) as max_sarea, occurrenceID
from MyData
group by occurrenceID"
)
sum(grouped_sum$max_sarea)
If the intention is to do this in a single 'sqldf' call, use a WITH clause (a common table expression):
library(sqldf)
sqldf("with tmpdat AS (
select max(sarea) as mxarea, occurrenceID
from MyData group by occurrenceID
) select sum(mxarea)
as smxarea from tmpdat")
# smxarea
#1 0.6067275
data
MyData <-
structure(list(occurrenceID = c("{0255531B-904F-4E2D-B81D-797A21165A2F}",
"{0255531B-904F-4E2D-B81D-797A21165A2F}", "{0255531B-904F-4E2D-B81D-797A21165A2F}",
"{0255531B-904F-4E2D-B81D-797A21165A2F}", "{175A4B1C-CA8C-49F6-9CD6-CED9187579DC}",
"{175A4B1C-CA8C-49F6-9CD6-CED9187579DC}", "{1EC60400-0AD0-4DB5-B815-221C4123AE7F}",
"{1EC60400-0AD0-4DB5-B815-221C4123AE7F}"), sarea = c(0.30626786,
0.49235953, 0.03490536, 1.389e-05, 0.0302389, 0.01360811, 0.08412911,
0.01852466)), class = "data.frame", row.names = c(NA, -8L))
You can do it by getting the sum of maximum values:
sqldf("select sum(max_sarea) as sum_of_max_sarea
from (select max(sarea) as max_sarea,
occurrenceID from Mydata group by occurrenceID)")
# sum_of_max_sarea
# 1 0.6067275
Data:
Mydata <- structure(list(occurrenceID = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L),
.Label = c("0255531B-904F-4E2D-B81D-797A21165A2F", "175A4B1C-CA8C-49F6-9CD6-CED9187579DC",
"1EC60400-0AD0-4DB5-B815-221C4123AE7F"), class = "factor"),
sarea = c(0.30626786, 0.49235953, 0.03490536, 1.389e-05, 0.0302389,
0.01360811, 0.08412911, 0.01852466)), class = "data.frame",
row.names = c(NA, -8L))
If DF is the last data frame shown in the question this sums the numeric column:
sqldf("select sum([max(sarea)]) as sum from DF")
## sum
## 1 11.07853
Note
We assume this data frame shown in reproducible form:
Lines <- "max(sarea) occurrenceID
1 0.49235953 {0255531B-904F-4E2D-B81D-797A21165A2F}
2 0.03023890 {175A4B1C-CA8C-49F6-9CD6-CED9187579DC}
3 0.08412911 {1EC60400-0AD0-4DB5-B815-221C4123AE7F}
4 0.00548259 {2412E244-2E9A-4477-ACC6-1EB02503BE75}
5 0.00295924 {40450574-ABEB-48E3-9BE5-09B5AB65B465}
6 0.01403846 {473FB631-D398-46B7-8E85-E63540BDFF92}
7 0.00257519 {4BABDE22-E8E0-435E-B60D-0BB9A84E1489}
8 0.02158115 {5F616A33-B028-46B1-AD92-89EAC1660C41}
9 0.00191211 {70067496-25B6-4337-8C70-782143909EF9}
10 0.03049355 {7F858EBB-132E-483F-BA36-80CE889373F5}
11 0.03947298 {9A579565-57EC-4E46-95ED-79724FA6F2AB}
12 0.02464722 {A9010BA3-0FE1-40B1-96A7-21122261A003}
13 0.00136672 {AAD710BF-1539-4235-87F1-34B66CF90781}
14 0.01139146 {AB1286C3-DBE3-467B-99E1-AEEF88A1B5B2}
15 0.07954269 {BED0433A-7167-4184-A25F-B9DBD358AFFB}
16 0.08401067 {C4EF0F45-5BF7-4F7C-BED8-D6B2DB718CB2}
17 0.04289261 {C58AC2C6-BDBE-4FE5-BD51-D70BBDFB4DB5}
18 0.03151558 {D4230F9C-80E4-454A-9D5D-0E373C6DCD9A}
19 0.00403585 {DD76A03A-CFBF-41E9-A571-03DA707BEBDA}
20 0.00007336 {E20DE254-8A0F-40BE-90D2-D6B71880E2A8}
21 9.81847859 {F382D5A6-F385-426B-A543-F5DE13F94564}
22 0.00815881 {F9032905-074A-468F-B60E-26371CF480BB}
23 0.24717113 {F9E5DC3C-4602-4C80-B00B-2AF1D605A265}"
DF <- read.table(text = Lines, check.names = FALSE)

reshape (converting to character)

I have the following data frame:
data=structure(list(Metric = c("x", "y"), Result1 = c(9,
18), Result2 = c(7, 14
), Delta = c(-170, -401)), .Names = c("Metric",
"Result_1", "Result_2", "Delta"), row.names = c(NA,
-2L), class = "data.frame")
I would like to transform it into a single row, doubling the number of columns.
I tried as.vector(t(data)). However, when I do that it transforms everything to character and I lose the type information.
Any help?
We can split the data frame and then use bind_cols from the dplyr package. Although I am not sure this is your expected output as you did not provide any examples, just description.
dplyr::bind_cols(split(data, data$Metric))
Metric Result_1 Result_2 Delta Metric1 Result_11 Result_21 Delta1
1 x 9 7 -170 y 18 14 -401
In Base R
dd=stack(data)
A=dd$values
names(A)=dd$ind
data.frame(as.list(A))
Metric Metric.1 Result_1 Result_1.1 Result_2 Result_2.1 Delta Delta.1
1 x y 9 18 7 14 -170 -401

Comparing pairs of rows in a list of data frames

I have a list that's 1314 elements long. Each element is a data frame consisting of two rows and four columns.
Game.ID Team Points Victory
1 201210300CLE CLE 94 0
2 201210300CLE WAS 84 0
I would like to use the lapply function to compare points for each team in each game, and change Victory to 1 for the winning team.
I'm trying to use this function:
test_vic <- lapply(all_games, function(x) {if (x[1,3] > x[2,3]) {x[1,4] = 1}})
But the result it produces is a list 1314 elements long with just the Game ID and either a 1 or a null, a la:
$`201306200MIA`
[1] 1
$`201306160SAS`
NULL
How can I fix my code so that each data frame maintains its shape? (I'm guessing solving the null part involves if-else, but I need to figure out the right syntax.)
Thanks.
Try
lapply(all_games, function(x) {x$Victory[which.max(x$Points)] <- 1; x})
Or another option would be to convert the list to data.table by using rbindlist and then do the conversion
library(data.table)
rbindlist(all_games)[,Victory:= +(Points==max(Points)) ,Game.ID][]
data
all_games <- list(structure(list(Game.ID = c("201210300CLE",
"201210300CLE"
), Team = c("CLE", "WAS"), Points = c(94L, 84L), Victory = c(0L,
0L)), .Names = c("Game.ID", "Team", "Points", "Victory"),
class = "data.frame", row.names = c("1",
"2")), structure(list(Game.ID = c("201210300CME", "201210300CME"
), Team = c("CLE", "WAS"), Points = c(90, 92), Victory = c(0L,
0L)), .Names = c("Game.ID", "Team", "Points", "Victory"),
row.names = c("1", "2"), class = "data.frame"))
You could try dplyr:
library(dplyr)
all_games %>%
bind_rows() %>%
group_by(Game.ID) %>%
mutate(Victory = row_number(Points)-1)
Which gives:
#Source: local data frame [4 x 4]
#Groups: Game.ID
#
# Game.ID Team Points Victory
#1 201210300CLE CLE 94 1
#2 201210300CLE WAS 84 0
#3 201210300CME CLE 90 0
#4 201210300CME WAS 92 1

Get the time between consecutive dates stored in a single column

I am trying to figure out how to get the time between consecutive events when events are stored as a column of dates in a dataframe.
sampledf=structure(list(cust = c(1L, 1L, 1L, 1L), date = structure(c(9862,
9879, 10075, 10207), class = "Date")), .Names = c("cust", "date"
), row.names = c(NA, -4L), class = "data.frame")
I can get an answer with
as.numeric(rev(rev(difftime(c(sampledf$date[-1],0),sampledf$date))[-1]))
# [1] 17 196 132
but it is really ugly. Among other things, I only know how to exclude the first item in a vector, but not the last so I have to rev() twice to drop the last value.
Is there a better way?
By the way, I will use ddply to do this to a larger set of data for each cust id, so the solution would need to work with ddply.
library(plyr)
ddply(sampledf,
c("cust"),
summarize,
daysBetween = as.numeric(rev(rev(difftime(c(date[-1],0),date))[-1]))
)
Thank you!
Are you looking for this?
as.numeric(diff(sampledf$date))
# [1] 17 196 132
To remove the last element, use head:
head(as.numeric(diff(sampledf$date)), -1)
# [1] 17 196
# library() stops with an error if plyr is not installed; require() would
# only warn and return FALSE, letting the ddply() call fail later instead.
library(plyr)
# diff() is evaluated on each cust group's own `date` column
ddply(sampledf, .(cust), summarise, daysBetween = as.numeric(diff(date)))
# cust daysBetween
# 1 1 17
# 2 1 196
# 3 1 132
You can just use diff.
as.numeric(diff(sampledf$date))
To leave off the last element, you can do:
# A negative index drops the element at that position, here the last one
vec[-length(vec)] # where `vec` is your vector
In this case I don't think you need to leave anything off though, because diff is already one element shorter:
# ddply() evaluates the summarize expressions once per `cust` group, so refer
# to the group's own `date` column. Writing `sampledf$date` here would diff
# the full, ungrouped column for every customer.
test <- ddply(
  sampledf,
  c("cust"),
  summarize,
  daysBetween = as.numeric(diff(date))
)
test
# cust daysBetween
#1 1 17
#2 1 196
#3 1 132

Resources