I have a the following data.frame:
d <- data.frame(id = c(1:20),
name = c("Paraffinole (CAS 8042-47-5)", "Pirimicarb", "Rapsol", "Thiacloprid",
"Chlorantraniliprole", "Flonicamid", "Tebufenozid", "Fenoxycarb",
"Bacillus thuringiensis subspecies", "aizawai Stamm AB", "Methoxyfenozide",
"Acequinocyl", "lndoxacarb", "Acetamiprid", "Spirotet_r:amat",
"Cydia pomonella Granulovirus", "mexikanischer Stamm", "lmidacloprid",
"Spirodiclofen", "Pyrethrine"),
desc = LETTERS[1:20])
The name column contains two entries of the string 'stamm'. Id' like to select these entries and paste them to the one column entry before and then delete this row. So df$name[9] should finally look like this Bacillus thuringiensis subspecies__aizawai Stamm AB and df$name[16] as follows: Cydia pomonella Granulovirus__mexikanischer Stamm. d$name[c(10,17] should then be deleted.
How can I match a string and paste it to the row above?
What about this ?
library(stringr)
d$name <- as.character(d$name)
where_stamm <- which(str_detect(d$name, "Stamm") == TRUE)
for (i in where_stamm) {
d$name[i-1] <- paste(d$name[i-1], d$name[i], sep = '__')
}
d <- d[-where_stamm, ]
> d$name[9]
[1] "Bacillus thuringiensis subspecies__aizawai Stamm AB"
> d$name[15]
[1] "Cydia pomonella Granulovirus__mexikanischer Stamm"
(note that "Cydia pomonella...." will now be at position 15, since we deleted row 10)
Here is a solution using dplyr:
library(dplyr)
d %>%
mutate(
to_delete = grepl("stamm", name, ignore.case = TRUE),
name = if_else(lead(to_delete, default = FALSE), paste(name, lead(name), sep = "__"),
as.character(name))
) %>%
filter(!to_delete) %>%
select(- to_delete)
# id name desc
# 1 1 Paraffinole (CAS 8042-47-5) A
# 2 2 Pirimicarb B
# 3 3 Rapsol C
# 4 4 Thiacloprid D
# 5 5 Chlorantraniliprole E
# 6 6 Flonicamid F
# 7 7 Tebufenozid G
# 8 8 Fenoxycarb H
# 9 9 Bacillus thuringiensis subspecies__aizawai Stamm AB I
# 10 11 Methoxyfenozide K
# 11 12 Acequinocyl L
# 12 13 lndoxacarb M
# 13 14 Acetamiprid N
# 14 15 Spirotet_r:amat O
# 15 16 Cydia pomonella Granulovirus__mexikanischer Stamm P
# 16 18 lmidacloprid R
# 17 19 Spirodiclofen S
# 18 20 Pyrethrine T
Related
I have two dataframes A and B, that share have the same column names and the same first column (Location)
A <- data.frame("Location" = 1:3, "X" = c(21,15, 7), "Y" = c(41,5, 5), "Z" = c(12,103, 88))
B <- data.frame("Location" = 1:3, "X" = c(NA,NA, 14), "Y" = c(50,8, NA), "Z" = c(NA,14, 12))
How do i replace the values in dataframe A with the values from B if the value in B is not NA?
Thanks.
We can use coalesce
library(dplyr)
A %>%
mutate(across(-Location, ~ coalesce(B[[cur_column()]], .)))
-output
# Location X Y Z
#1 1 21 50 12
#2 2 15 8 14
#3 3 14 5 12
Here's an answer in base R:
i <- which(!is.na(B),arr.ind = T)
A[i] <- B[i]
A
Location X Y Z
1 1 21 50 12
2 2 15 8 14
3 3 14 5 12
One option with fcoalesce from data.table pakcage
list2DF(Map(data.table::fcoalesce,B,A))
gives
Location X Y Z
1 1 21 50 12
2 2 15 8 14
3 3 14 5 12
Need help to split one dataframe dynamically into multiple smaller dataframe’s based on a column interval and save them as well.
Example:
x = data.frame(num = 1:26, let = letters, LET = LETTERS)
The above dataframe x needs to split into smaller dataframes based on value in num, in an interval of 5.
The result would be 6 dataframes
> 1. 0 – 5
> 2. 6 – 10
> 3. 11 – 15
> 4. 16 -20
> 5. 21 -25
> 6. 26 – 30
You can use the split function and cut function to perform the operation:
x = data.frame(num = 1:26, let = letters, LET = LETTERS)
answer<-split(x, cut(x$num, breaks=c(0, 5, 10, 15, 20, 25, 30)))
you can then pass this list to lapply for further processing.
Using tidyverse
library(tidyverse)
x = data.frame(num = 1:26, let = letters, LET = LETTERS)
##Brake the data frame
y <- x %>%
mutate(group = cut_width(num,5, boundary = 0,closed = "right"))
##Put them into a list
list_1 <- lapply(1:length(unique(y$group)),
function(i)filter(y, group == unique(y$group)[i]))
Consider also tagging records by multiples of 5 then running by, the function to split a data frame by one or more factors:
df <- data.frame(num = 1:26, let = letters, LET = LETTERS)
df$grp <- ceiling(df$num / 5)
df_list <- by(df, df$grp, function(sub) transform(sub, grp=NULL))
Output
df_list
# df$grp: 1
# num let LET
# 1 1 a A
# 2 2 b B
# 3 3 c C
# 4 4 d D
# 5 5 e E
# -------------------------------------------------------------------------------------------
# df$grp: 2
# num let LET
# 6 6 f F
# 7 7 g G
# 8 8 h H
# 9 9 i I
# 10 10 j J
# -------------------------------------------------------------------------------------------
# df$grp: 3
# num let LET
# 11 11 k K
# 12 12 l L
# 13 13 m M
# 14 14 n N
# 15 15 o O
# -------------------------------------------------------------------------------------------
# df$grp: 4
# num let LET
# 16 16 p P
# 17 17 q Q
# 18 18 r R
# 19 19 s S
# 20 20 t T
# -------------------------------------------------------------------------------------------
# df$grp: 5
# num let LET
# 21 21 u U
# 22 22 v V
# 23 23 w W
# 24 24 x X
# 25 25 y Y
# -------------------------------------------------------------------------------------------
# df$grp: 6
# num let LET
# 26 26 z Z
This seems to be a neater way. You can easily adjust the names of the output files and the number of splits
library(tidyverse)
df <- data.frame(num = 1:26, let = letters, LET = LETTERS)
# split data frame into 6 pieces
split_df <- split(df, ceiling(1:nrow(df) / nrow(df) * 6))
# save each of them in turn
split_df %>%
names(.) %>%
walk(~ write_csv(split_df[[.]], paste0("part_", ., ".csv")))
Say I have a dataframe like this:
set.seed(1)
n <- 20
df <- data.frame(ID = sample(1:5, n, replace = TRUE),
Fac1 = sample(letters[1:5], n, replace = TRUE),
Fac2 = sample(LETTERS[10:15], n, replace = TRUE),
Val1 = sample(1:10, n, replace = TRUE)) %>%
arrange(ID) %>% group_by(ID,Fac1) %>%
summarise(Val1 = sum(Val1),Fac2 = first(Fac2)) %>%
group_by(ID,Fac2) %>%
mutate(Val2 = sum(Val1))
df
ID Fac1 Val1 Fac2 Val2
1 1 b 9 N 9
2 1 c 9 O 9
3 2 a 4 K 4
4 2 b 10 M 18
5 2 c 4 L 4
6 2 d 8 M 18
7 2 e 10 N 10
8 3 d 14 N 14
9 4 b 8 L 22
10 4 c 14 L 22
11 4 d 9 K 9
12 4 e 6 N 6
13 5 a 13 M 13
14 5 b 3 N 3
ID is a grouping variable. Rows with an Fac1 value of e should have the Fac2 value changed to be that same as the other row in the group where Fac1 is either b or c and the sum of Val 2 for the two rows if greater than 20. (I've simplified this to the point where you probably don't get why but just work with me).
This is what I have tried so far:
result <- df %>% group_by(ID) %>%
mutate(Fac2 = case_when(
Fac1 == "e" &
sum(Val2,ifelse(Fac1 %in% c("b","c"), Val2, 0)) > 20 ~
ifelse(sum(Val2,ifelse(Fac1 %in% c("b","c"),Val2,0)) > 20,
as.character(Fac2),
NA_character_),
TRUE ~ as.character(Fac2)
))
It doesn't work properly because it is summing the first value of Val2 in the group rather than only doing so when Fac1 is b or c.
Any ideas?
Adding desired outcome:
ID Fac1 Val1 Fac2 Val2
1 1 b 9 N 9
2 1 c 9 O 9
3 2 a 4 K 4
4 2 b 10 M 18
5 2 c 4 L 4
6 2 d 8 M 18
7 2 e 10 M 10 **Changed to M b/c row 4 is M and 10 + 18 > 20
8 3 d 14 N 14
9 4 b 8 L 22
10 4 c 14 L 22
11 4 d 9 K 9
12 4 e 6 L 6 **Changed to L b/c row 10 is L and 6 + 22 > 20
13 5 a 13 M 13
14 5 b 3 N 3
I'm having a hard time following what you are wanting the values to be changed to.
But when I have multiple conditions or decisions that need to be made in a sequence, I use a loop and a series of if statements to go through the data frame. I prefer while loops, so that's what I'll use in the example.
counter <- 1
stopper <- nrow(df)
while (counter <= stopper) {
fac1 <- df$Fac1[counter1]
if (fac1 == 'e') {
if ([INSERT NEXT CONDITION]) #Change whichever value your trying to change using the counter to reference the correct row.
else #Change whichever value your trying to change using the counter to reference the correct row.
}
counter <- counter + 1
}
For me, simplifying the code makes it a lot easier for me to keep track of what decisions are being made. It also allows for complex decisions that are difficult to get functions to work with.
I was able to get the desired result with this code. I made a new column containing the result of the test for what value to replace Fac2 with, which wasn't entirely necessary but makes it more readable and debugable.
The key thing was to use first(na.omit()) to get the value from a different row in the same group which met the condition.
result <- df %>% group_by(ID) %>%
mutate(Max_bc_Val = ifelse(Val2 == max(ifelse(Fac1 %in% c("b","c"),
Val2,0)),
ifelse(Fac1 %in% c("b","c"),
as.character(Fac2),NA),NA)) %>%
mutate(Fac2 = case_when(
Fac1 == "e" ~ ifelse(is.na(first(na.omit(Max_bc_Val))),
NA_character_,
first(na.omit(Max_bc_Val))),
TRUE ~ as.character(Fac2)))
This works but doesn't seem like the best solution. Any other ideas?
I have a dataframe, say
df = data.frame(x = c("a","a","b","b","b","c","d","t","c","b","t","c","t","a","a","b","d","t","t","c"),
y = c(2,4,5,2,6,2,4,5,2,6,2,4,5,2,6,2,4,5,2,6))
I want to remove only those rows in which one or multiple ts are directly in between a d and a c, in all other cases I want to retain the cases. So for this example, I would like to remove the ts on row 8, 18 and 19, but keep the others. I have over thousands of cases so doing this manually would be a true horror. Any help is very much appreciated.
One option would be to use rle to get runs of the same string and then you can use an sapply to check forward/backward and return all the positions you want to drop:
rle_vals <- rle(as.character(df$x))
drop <- unlist(sapply(2:length(rle_vals$values), #loop over values
function(i, vals, lengths) {
if(vals[i] == "t" & vals[i-1] == "d" & vals[i+1] == "c"){#Check if value is "t", previous is "d" and next is "c"
(sum(lengths[1:i-1]) + 1):sum(lengths[1:i]) #Get row #s
}
},vals = rle_vals$values, lengths = rle_vals$lengths))
drop
#[1] 8 18 19
df[-drop,]
# x y
#1 a 2
#2 a 4
#3 b 5
#4 b 2
#5 b 6
#6 c 2
#7 d 4
#9 c 2
#10 b 6
#11 t 2
#12 c 4
#13 t 5
#14 a 2
#15 a 6
#16 b 2
#17 d 4
#20 c 6
This also works, by collapsing to a string, identifying groups of t's between d and c (or c and d - not sure whether you wanted this option as well), then working out where they are and removing the rows as appropriate.
df = data.frame(x=c("a","a","b","b","b","c","d","t","c","b","t","c","t","a","a","b","d","t","t","c"),
y=c(2,4,5,2,6,2,4,5,2,6,2,4,5,2,6,2,4,5,2,6),stringsAsFactors = FALSE)
dfs <- paste0(df$x,collapse="") #collapse to a string
dfs2 <- do.call(rbind,lapply(list(gregexpr("dt+c",dfs),gregexpr("ct+d",dfs)),
function(L) data.frame(x=L[[1]],y=attr(L[[1]],"match.length"))))
dfs2 <- dfs2[dfs2$x>0,] #remove any -1 values (if string not found)
drop <- unlist(mapply(function(a,b) (a+1):(a+b-2),dfs2$x,dfs2$y))
df2 <- df[-drop,]
Here is another solution with base R:
df = data.frame(x = c("a","a","b","b","b","c","d","t","c","b","t","c","t","a","a","b","d","t","t","c"),
y = c(2,4,5,2,6,2,4,5,2,6,2,4,5,2,6,2,4,5,2,6))
#
s <- paste0(df$x, collapse="")
L <- c(NA, NA)
while (TRUE) {
r <- regexec("dt+c", s)[[1]]
if (r[1]==-1) break
L <- rbind(L, c(pos=r[1]+1, length=attr(r, "match.length")-2))
s <- sub("d(t+)c", "x\\1x", s)
}
L <- L[-1,]
drop <- unlist(apply(L,1, function(x) seq(from=x[1], len=x[2])))
df[-drop, ]
# > drop
# 8 18 19
# > df[-drop, ]
# x y
# 1 a 2
# 2 a 4
# 3 b 5
# 4 b 2
# 5 b 6
# 6 c 2
# 7 d 4
# 9 c 2
# 10 b 6
# 11 t 2
# 12 c 4
# 13 t 5
# 14 a 2
# 15 a 6
# 16 b 2
# 17 d 4
# 20 c 6
With gregexpr() it is shorter:
s <- paste0(df$x, collapse="")
g <- gregexpr("dt+c", s)[[1]]
L <- data.frame(pos=g+1, length=attr(g, "match.length")-2)
drop <- unlist(apply(L,1, function(x) seq(from=x[1], len=x[2])))
df[-drop, ]
Suppose I have a data frame that looks like this.
# start end motif
# 2 6 a
# 10 15 b
# 30 35 c
How would I create a data frame that fills in the remaining start and end locations like so up to a certain number Max_end:
Max_end <- 33
# start end motif
# 0 2 na # <- 0-2 are filled in because it is not in the original data frame
# 2 6 a # <- 2-6 are in the original
# 6 10 na # <- 6-10 is not
# 10 15 b # <- 10-15 is
# 15 30 na # and so on
# 30 33 c
And further, calculates the distance between the start and end locations and creates a one column data frame.
# Length motif
# 2 na
# 4 a
# 4 na
# 5 b
# 15 na
# 3 c
Currently this is how i am doing it: It is very inefficient
library(data.table)
library(stringi)
f <- fread('ABC.txt',header=F,skip=1)$V1
f <- paste(f, collapse = "")
motifs = c('GATC', 'CTGCAG', 'ACCACC', 'CC(A|T)GG', 'CCAC.{8}TGA(C|T)')
v <- na.omit(data.frame(do.call(rbind, lapply(stri_locate_all_regex(f, motifs), unlist))))
v <- v[order(v[,1]),]
v2difference <- "blah"
for(i in 2:nrow(v)){
if(v[i,1] > v[i-1,2]+2){v2difference[i] <- v[i,1]-v[i-1,2]-2}
}
v2difference[1] <- v[1,1]
v2 <- data.frame(Order=seq(1, 2*nrow(v), 2),Lengths=matrix(v2difference, ncol = 1),Motifs="na")
v1 <- data.frame(Order=seq(2, 2*nrow(v), 2),Lengths=(v$end-v$start+1),Motifs=na.omit(unlist(stri_extract_all_regex(f,motifs))))
V <- data.frame(Track=1,rbind(v1,v2))
V <- V[order(V$Order),]
B <- V[,!(names(V) %in% "Order")]
Max_end <- 33
breaks <- c(0, t(as.matrix(dat[,1:2])), Max_end) # get endpoints
breaks <- breaks[breaks <= Max_end]
merge(dat, data.frame(start=breaks[-length(breaks)], end=breaks[-1]), all=T)
# start end motif
# 1 0 2 <NA>
# 2 2 6 a
# 3 6 10 <NA>
# 4 10 15 b
# 5 15 30 <NA>
# 6 30 33 <NA>
# 7 30 35 c
To specify a start and endpoint, you could do
Max_end <- 33
Max_start <- 10
breaks <- unique(c(Max_start, t(as.matrix(dat[,1:2])), Max_end))
breaks <- breaks[breaks <= Max_end & breaks >= Max_start]
merge(dat, data.frame(start=breaks[-length(breaks)], end=breaks[-1]), all.y=T)
# start end motif
# 1 10 15 b
# 2 15 30 <NA>
# 3 30 33 <NA>
Note: this doesn't include "c" in the shortened final interval, you would need to decide if that values gets included or not when the interval changes.