Create two column with multiple separators - r

I have a dataframe such as
COl1
scaffold_97606_2-BACs_-__SP1_1
UELV01165908.1_2-BACs_+__SP2_2
UXGC01046554.1_9-702_+__SP3_3
scaffold_12002_1087-1579_-__SP4_4
and I would like to separate both into two columns and get :
COL1 COL2
scaffold_97606 2-BACs_-__SP1_1
UELV01165908.1 2-BACs_+__SP2_2
UXGC01046554.1 9-702_+__SP3_3
scaffold_12002 1087-1579_-__SP4_4
So, as you can see, the separator changes: it can be .Number_ or Number_Number.
So far I wrote ;
# Attempt from the question: split the column into col1/col2 and merge any
# extra pieces into col2. The `sep` string is a placeholder for the regex
# that is still unknown.
df2 <- separate(
  df1,
  COL1,
  into = paste0("col", 1:2),
  sep = " the separator patterns ",
  extra = "merge"
)
but I do not know what separator I should use here in the " the separator patterns "part

You may use
> # Split at the underscore that follows a digit and precedes "<digits>-"
> separate(df1, COl1, into = paste0("col", 1:2), sep = "(?<=\\d)_(?=\\d+-)", extra = "merge")
col1 col2
1 scaffold_97606 2-BACs_-__SP1_1
2 UELV01165908.1 2-BACs_+__SP2_2
3 UXGC01046554.1 9-702_+__SP3_3
4 scaffold_12002 1087-1579_-__SP4_4
See the regex demo
Pattern details
(?<=\d) - a positive lookbehind that requires a digit immediately to the left of the current location
_ - an underscore
(?=\d+-) - a positive lookahead that requires one or more digits and then a - immediately to the right of the current location.

You can use extract :
# Col1 = shortest prefix that ends in a digit and is followed by "_";
# Col2 = everything after that underscore.
tidyr::extract(
  data = df,
  col = COl1,
  into = c("Col1", "Col2"),
  regex = "(.*?\\d+)_(.*)"
)
# Col1 Col2
#1 scaffold_97606 2-BACs_-__SP1_1
#2 UELV01165908.1 2-BACs_+__SP2_2
#3 UXGC01046554.1 9-702_+__SP3_3
#4 scaffold_12002 1087-1579_-__SP4_4
data
# Sample input: one character column (COl1) with four identifiers.
df <- data.frame(
  COl1 = c(
    "scaffold_97606_2-BACs_-__SP1_1",
    "UELV01165908.1_2-BACs_+__SP2_2",
    "UXGC01046554.1_9-702_+__SP3_3",
    "scaffold_12002_1087-1579_-__SP4_4"
  ),
  stringsAsFactors = FALSE
)

Related

Find a row that contains a certain string, then take the row on top, the string row and the row under it, and move them to a new dataframe

So i have a table that looks like this:
I want to search through the first column and, every time I see nl.audio, take the row on top, the nl.audio row and the row right under it, and move them to a new column so it looks like this:
not sure how to go about doing this.
the table comes from trying to get nested json values into a dataframe. like this
library(jsonlite)
library(tidyverse)
# `pattern` is a regular expression: escape the dot and anchor it at the end
# so that only files with a ".json" extension are listed.
files <- list.files(path = ".", pattern = "\\.json$", all.files = FALSE,
                    full.names = FALSE)
# Fail with a clear message instead of handing NA to fromJSON().
if (length(files) == 0) {
  stop("no .json files found in the working directory", call. = FALSE)
}
data <- fromJSON(files[1])
# Flatten the nested translation map into a named vector, then turn it into a
# one-column data frame (the vector names become the row names).
dat2 <- unlist(data$translation_map)
dat2 <- as.data.frame(dat2)
dput:
# dput() output for the `dat2` data frame built above (sample values): a single
# column named `dat2`, with the flattened key names stored as row names.
structure(list(dat2 = c("Iraat.",
" _1645805605.mp3",
"Ie.", "wn", "", "Wdis.",
"ewdewf.mp3",
"wedew.", "[k]ws.[/k]",
" _1645805740.mp3",
"edwedwedw.", "Ik ewwewe[/k].",
"we45805760.mp3",
"I h89.", "ewd3n", "", "ad23dt", "",
"Ik d2. ", "I d2d3.",
"Ha3d3d/k] 20.", "H3d20.",
"id3n", "", "straat")), row.names = c("str-5e854867d9c6.nl.value",
"str_f15f7751-227dc6.nl.audio", "str_f15f7751.en.value",
"str.nl.value", "str_172a516ca.en.value",
"str_4567f686.nl.value", "str_4.nl.audio",
"stcb0ca14.en.value", "str_622f99395.nl.value",
"str_622f9395.nl.audio", "str_622f90de9395.en.value",
"str_f25afe16.nl.value", "str_f2fad09045afe16.nl.audio",
"str_f2fad89045afe16.en.value", "s9e844c432e80.nl.value",
"str_b0c1b42e80.en.value", "str_e6d847f3-60b7-.nl.value",
"str_.en.value", "str_b61f9404-.nl.value",
"str_ b.en.value", "str_76e28ea6.nl.value",
"str-61a1b83bf1ba.en.value", "str_6280d5a49c42a24.nl.value",
"str5-0d5a49c42a24.en.value", "str_5e6b2202e748.nl.value"
), class = "data.frame")
Something like this:
library(dplyr)
library(stringr)
df %>%
  # Squish whitespace in every column. The columns are selected explicitly
  # because calling across() without a column selection is deprecated.
  mutate(across(everything(), str_squish)) %>%
  mutate(
    # fixed() matches "nl.audio" literally; as a regex the "." would match any
    # character.
    # A: value from the row above each nl.audio row.
    A = if_else(str_detect(V1, fixed("nl.audio")), lag(V2), NA_character_),
    # B: the value itself when it ends in ".mp3" (dot escaped to match a
    # literal dot). Digits-only alternative:
    # B = str_extract(V2, "\\d+\\.mp3"),
    B = str_extract(V2, ".*\\.mp3$"),
    # C: value from the row below each nl.audio row.
    C = if_else(str_detect(V1, fixed("nl.audio")), lead(V2), NA_character_),
    # Drop the input columns V1 and V2 once they have been used.
    .keep = "unused"
  ) %>%
  # Keep only the rows where A, B and C are all present.
  na.omit()
A B C
2 nstraat. 1645805605.mp3 constraat.
7 tihdhis. 645805622.mp3 use.
# Sample input: V1 holds the keys, V2 the (tab-prefixed) values.
df <- data.frame(
  V1 = c(
    "str_f15d9c6.nl.value",
    "47c-5e854867d9c6.nl.audio",
    "5e854867d9c6.en.value",
    "92bd-91b8f180bd3a.nl.value",
    "4-92bd-91b8f180bd3a.en.value",
    "40a8-88ef-5890ecbOca14.nl.value",
    "890ecbOca14.nl.audio",
    "ca14.en.value"
  ),
  V2 = c(
    "\tnstraat.",
    "\t1645805605.mp3",
    "\tconstraat.",
    "\tlemons",
    " \t",
    "\ttihdhis.",
    "\t645805622.mp3",
    "\tuse."
  ),
  stringsAsFactors = FALSE
)
We may need grep to find the index. Then add and subtract 1 to the index and extract the values from the second column based on that index (assuming data.frame columns)
# Rows whose first column contains the literal text "nl.audio".
i1 <- grep("nl.audio", df1[[1]], fixed = TRUE)
# Row above each match. A match in the first row has no row above: use NA
# there, because a 0 index would be dropped and misalign the columns.
prev_ind <- i1 - 1
prev_ind[prev_ind < 1] <- NA
# Row below each match (an index past the last row already yields NA).
next_ind <- i1 + 1
# col1 = row above, col2 = the nl.audio row itself, col3 = row below.
data.frame(col1 = df1[[2]][prev_ind],
           col2 = df1[[2]][i1],
           col3 = df1[[2]][next_ind])

how to remove quotation marks string before and after a string in R

I've tried several ways I found here, but I haven't gotten the result I need. I need to remove the " "" that appears in the first column and the " that appears at the end of the last column. Because the database runs for several thousand rows, the number of digits increases.
what is constant is the " "" on the first column and the " on the last column
# Sample tibble: the stray double quotes appear in the column names as well as
# in the values (leading `"` and inner `""` in column 1, trailing `"` in
# column 2).
db <- structure(list(`"1""Name` = c("\"2\"\"AAFC", "\"3\"\"Adfd",
"\"4\"\"Abbb"), `References"` = c("3\"", "4\"", "4\"")), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"))
If we need to remove only the leading/trailing ", use trimws with the whitespace argument specifying the regex pattern
library(dplyr)
# Strip double quotes at the start and at the end of every value; quotes in
# the middle of a value are left alone.
db1 <- mutate(
  db,
  across(everything(), function(x) trimws(x, whitespace = '"'))
)
Or use str_remove_all to remove all the double quotes
library(stringr)
# Delete every double quote, wherever it occurs, in all columns.
db1 <- mutate(
  db,
  across(everything(), function(x) str_remove_all(x, '"'))
)
To remove all occurrences of '"' from all the columns, you can use lapply with gsub:
# Replace each column by a copy with all double quotes removed; assigning
# into db[] keeps the original object's class and column names.
db[] <- lapply(db, gsub, pattern = '"', replacement = '')
db
# A tibble: 3 x 2
# `"1""Name` `References"`
# <chr> <chr>
#1 2AAFC 3
#2 3Adfd 4
#3 4Abbb 4
If there are lot of columns and you want to do this only for selected columns we can subset those columns and pass to lapply. For example, for first and last column we can do :
# Positions of the first and the last column.
cols <- c(1, ncol(db))
# Remove the double quotes in those two columns only.
db[cols] <- lapply(db[cols], gsub, pattern = '"', replacement = '')

Merge columns that are separated by character in order

I have a table that has two columns with information separated by ":". The problem is that not all of them have the same size.
I'll write an example:
Col1 Col2
AA:BB:CC 1:2:3
AA:DD:BB:CC 4:5:6:7
And I would like a third column that is
Col3
AA=1:BB=2:CC=3
AA=4:DD=5:BB=6:CC=7
I have no idea where to start. I've tried to split them, but it got me nowhere.
We can use strsplit to split the 'Col1', 'Col2' by :, then concatenate the corresponding list elements with str_c to create the 'Col3'
library(dplyr)
library(purrr)
library(stringr)
# Split both columns on ":", glue the pieces pairwise as key=value, and join
# the pairs back together with ":".
mutate(
  df1,
  col3 = map2_chr(
    strsplit(Col1, ":"),
    strsplit(Col2, ":"),
    function(keys, vals) str_c(keys, vals, sep = "=", collapse = ":")
  )
)
# Col1 Col2 col3
#1 AA:BB:CC 1:2:3 AA=1:BB=2:CC=3
#2 AA:DD:BB:CC 4:5:6:7 AA=4:DD=5:BB=6:CC=7
data
# Sample input: two character columns holding ":"-separated keys and values.
df1 <- data.frame(
  Col1 = c("AA:BB:CC", "AA:DD:BB:CC"),
  Col2 = c("1:2:3", "4:5:6:7"),
  stringsAsFactors = FALSE
)

Select rows with certain values and update in R

I want to select rows that contain 'dm' (something like LIKE '%dm%' should be used) and then update those rows by adding '*' at the beginning and the end of the string. An example would be:
id text
1 abc
2 admc
The output:
id text
1 abc
2 *admc*
You can identify the rows with "dm" using grep/grepl and change their values using paste0.
# Positions of the rows whose text contains "dm".
inds <- grep(pattern = "dm", x = df[["text"]])
# Wrap only those values in asterisks.
df[["text"]][inds] <- paste0("*", df[["text"]][inds], "*")
df
# id text
#1 1 abc
#2 2 *admc*
Using data.table syntax avoids the creation of temporary variable (inds).
library(data.table)
# Convert df to a data.table by reference, then update the matching rows in
# place with `:=`.
setDT(df)
df[grep("dm", text), text := paste0("*", text, "*")]
data
# Sample input: integer id plus a character text column.
df <- data.frame(
  id = 1:2,
  text = c("abc", "admc"),
  stringsAsFactors = FALSE
)
Here is a tidyverse solution.
First, detect the presence of "dm" with stringr::str_detect
Then, using dplyr::if_else, if "dm" is present, prepend/append "*" using paste0; else, keep the text as-is
_
# Wrap the text in asterisks where it contains "dm"; leave other rows as-is.
mutate(
  df,
  text = if_else(
    condition = str_detect(text, "dm"),
    true = paste0("*", text, "*"),
    false = text
  )
)
data
# Sample input parsed from inline text; strings are kept as character.
df <- read.table(header = TRUE, stringsAsFactors = FALSE, text = "id text
1 abc
2 admc")

Remove unnecessary symbols in the data in R

That's my dataset
1.abc
2.def
3.2354
4.. $.?,
How can I delete the observations that contain only digits, or only symbols such as points and commas, or any mix of symbols and digits (e.g. 1#5??%), as well as words in the text that have fewer than two letters?
We can use str_count to count the number of characters and subset the dataset
library(stringr)
library(dplyr)
# Keep the rows whose value contains more than two alphabetic characters.
filter(df1, str_count(v1, "[[:alpha:]]") > 2)
Or with gsub to remove any character that is not a letter and count the number of characters with nchar to create a logical index for subsetting
# Drop every non-letter, then keep the rows with more than two letters left.
subset(
  x = df1,
  subset = nchar(gsub(pattern = "[^[:alpha:]]+", replacement = "", x = v1)) > 2
)
# v1
#1 1.abc
#2 2.def
data
# Sample input: a single character column v1.
df1 <- data.frame(
  v1 = c("1.abc", "2.def", "3.2354", "4.. $.?,"),
  stringsAsFactors = FALSE
)

Resources