I want to select the rows whose text contains 'dm' (something like SQL's LIKE '%dm%') and then update those rows by adding '*' to the beginning and the end of the string. An example would be:
id text
1 abc
2 admc
The output:
id text
1 abc
2 *admc*
You can identify the rows with "dm" using grep/grepl and change their values using paste0.
# Find the positions of the rows whose text contains "dm", then wrap only
# those values in "*".
inds <- which(grepl("dm", df$text))
df$text[inds] <- paste0("*", df$text[inds], "*")
df
#   id   text
# 1  1    abc
# 2  2 *admc*
Using data.table syntax avoids creating the temporary variable (inds).
library(data.table)
# Convert df to a data.table by reference, then star-wrap the matching rows
# in place with `:=`.
setDT(df)
df[grep("dm", text), text := paste0("*", text, "*")]
data
# Two-row example input: an integer id and a character text column.
df <- data.frame(
  id = 1:2,
  text = c("abc", "admc"),
  stringsAsFactors = FALSE
)
Here is a tidyverse solution.
First, detect the presence of "dm" with stringr::str_detect
Then, using dplyr::if_else, if "dm" is present, prepend/append "*" using paste0; else, keep the text as-is
_
# Wrap text in "*" wherever it contains "dm"; every other row passes through
# unchanged.
df %>%
  mutate(
    text = if_else(
      condition = str_detect(text, "dm"),
      true = paste0("*", text, "*"),
      false = text
    )
  )
data
# Parse the inline table; keep the text column as character, not factor.
df <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "id text
1 abc
2 admc"
)
Related
So i have a table that looks like this:
I want to search through the first column and, every time I see nl.audio, take the row above it, the nl.audio row itself and the row right under it, and move them into new columns so it looks like this:
not sure how to go about doing this.
the table comes from trying to get nested json values into a dataframe. like this
library(jsonlite)
library(tidyverse)
# List the JSON files in the working directory. `pattern` is a regular
# expression, so the dot is escaped and the match is anchored to the end of
# the name; the unanchored ".json" would also match names like "myjson.txt".
files <- list.files(path = ".", pattern = "\\.json$", all.files = FALSE,
                    full.names = FALSE)
# Fail with a clear message instead of handing NA to fromJSON().
if (length(files) == 0) {
  stop("no .json files found in the working directory", call. = FALSE)
}
# Only the first file is read.
data <- fromJSON(files[1])
# Flatten translation_map into a single vector, then wrap it in a data frame
# (the column is named after the variable, dat2).
dat2 <- unlist(data$translation_map)
dat2 <- as.data.frame(dat2)
dput:
structure(list(dat2 = c("Iraat.",
" _1645805605.mp3",
"Ie.", "wn", "", "Wdis.",
"ewdewf.mp3",
"wedew.", "[k]ws.[/k]",
" _1645805740.mp3",
"edwedwedw.", "Ik ewwewe[/k].",
"we45805760.mp3",
"I h89.", "ewd3n", "", "ad23dt", "",
"Ik d2. ", "I d2d3.",
"Ha3d3d/k] 20.", "H3d20.",
"id3n", "", "straat")), row.names = c("str-5e854867d9c6.nl.value",
"str_f15f7751-227dc6.nl.audio", "str_f15f7751.en.value",
"str.nl.value", "str_172a516ca.en.value",
"str_4567f686.nl.value", "str_4.nl.audio",
"stcb0ca14.en.value", "str_622f99395.nl.value",
"str_622f9395.nl.audio", "str_622f90de9395.en.value",
"str_f25afe16.nl.value", "str_f2fad09045afe16.nl.audio",
"str_f2fad89045afe16.en.value", "s9e844c432e80.nl.value",
"str_b0c1b42e80.en.value", "str_e6d847f3-60b7-.nl.value",
"str_.en.value", "str_b61f9404-.nl.value",
"str_ b.en.value", "str_76e28ea6.nl.value",
"str-61a1b83bf1ba.en.value", "str_6280d5a49c42a24.nl.value",
"str5-0d5a49c42a24.en.value", "str_5e6b2202e748.nl.value"
), class = "data.frame")
Something like this:
library(dplyr)
library(stringr)
df %>%
  # Trim and collapse whitespace in every column. The columns are selected
  # explicitly because across() with an empty selection is deprecated.
  mutate(across(everything(), str_squish)) %>%
  mutate(
    # fixed() matches "nl.audio" literally; as a regex the "." would match
    # any character.
    A = ifelse(str_detect(V1, fixed("nl.audio")), lag(V2), NA_character_),
    # B = str_extract(V2, "\\d+\\.mp3"),
    B = str_extract(V2, ".*\\.mp3$"),
    C = ifelse(str_detect(V1, fixed("nl.audio")), lead(V2), NA_character_),
    .keep = "unused"
  ) %>%
  # Drop every row with a missing A, B or C (this removes all non-audio rows).
  na.omit()
A B C
2 nstraat. 1645805605.mp3 constraat.
7 tihdhis. 645805622.mp3 use.
# Sample input for the pipeline above: V1 holds the key names and V2 the
# values, which still carry leading tab/space characters.
df <- structure(list(V1 = c("str_f15d9c6.nl.value", "47c-5e854867d9c6.nl.audio",
"5e854867d9c6.en.value", "92bd-91b8f180bd3a.nl.value", "4-92bd-91b8f180bd3a.en.value",
"40a8-88ef-5890ecbOca14.nl.value", "890ecbOca14.nl.audio", "ca14.en.value"
), V2 = c("\tnstraat.", "\t1645805605.mp3", "\tconstraat.", "\tlemons",
" \t", "\ttihdhis.", "\t645805622.mp3", "\tuse.")), class = "data.frame", row.names = c(NA,
-8L))
We may need grep to find the index. Then add and subtract 1 to the index and extract the values from the second column based on that index (assuming data.frame columns)
# Rows whose key (first column) contains the literal text "nl.audio".
i1 <- grep("nl.audio", df1[[1]], fixed = TRUE)
prev_ind <- i1 - 1
next_ind <- i1 + 1
# A match in the first row has no previous row. Index 0 would be silently
# dropped and leave the columns with different lengths, so use NA instead.
# (An index past the last row already yields NA.)
prev_ind[prev_ind < 1] <- NA
# One output row per match: the value above, the audio value itself and the
# value below.
data.frame(col1 = df1[[2]][prev_ind],
           col2 = df1[[2]][i1],
           col3 = df1[[2]][next_ind])
In a column like this:
data.frame(id = c(1), text = c("keep<U+0E1E>it"))
Is there any way to use gsub on the text column to remove the characters inside the <> as well as the <> itself?
Expected output data.frame(id = c(1), text = c("keep it"))
Using stringr package:
library(stringr)
library(dplyr)
df <- data.frame(id = c(1), text = c("keep<U+0E1E>it"))
df
id text
1 1 keep<U+0E1E>it
# "<[^>]*>" stops at the first ">", so each <...> tag is matched on its own
# (the greedy "<.*>" would swallow everything between the first "<" and the
# last ">"). str_remove_all() strips every tag, not only the first one.
df %>% mutate(text = str_remove_all(text, "<[^>]*>"))
id text
1 1 keepit
Using gsub:
# "[^>]*" cannot run past a closing ">", so text between two separate <...>
# tags is preserved (the greedy "<.*>" would delete it).
gsub("<[^>]*>", "", df$text)
[1] "keepit"
In a dataframe like this:
data.frame(id = c(1,2), alternativenum = c(342, 5921), text = c("one text here","another text here also"))
How is it possible to add a new column in this dataframe where it will contain a word counter for every row using the text column?
Example of expected output
data.frame(id = c(1,2), alternativenum = c(342, 5921), text = c("one text here","another text here also"), counter = c(3, 4))
The separator is always the blank space between words
You can use str_count() from the stringr package to count the number of words using the \\w+ escape pattern.
# Example input; the sentences are kept as plain character strings.
df <- data.frame(
  id = as.numeric(1:2),
  alternativenum = c(342, 5921),
  text = c("one text here", "another text here also"),
  stringsAsFactors = FALSE
)
library(stringr)
library(dplyr)
df %>%
  # Refer to the column by its bare name: `df$text` bypasses the data flowing
  # through the pipe and gives wrong results once the rows have been
  # filtered, reordered or grouped upstream.
  mutate(counter = str_count(text, pattern = "\\w+"))
id alternativenum text counter
1 1 342 one text here 3
2 2 5921 another text here also 4
Having data frame like this one:
data.frame(id = c(1,2,3), text = c("my text here", "another the here but different", "no text"))
How is it possible to count the number of words in every row and drop the rows that have 2 words or fewer?
Expected output
data.frame(id = c(1,2), text = c("my text here", "another the here but different"))
One option utilizing the stringr library could be:
# Keep the rows for which a third word can be extracted (non-NA result).
df[complete.cases(word(df$text, 3)), ]
id text
1 1 my text here
2 2 another the here but different
Or another option using the stringr library (provided by @Sotos):
# Keep the rows whose text contains more than one literal space.
df[str_count(df$text, fixed(" ")) > 1, ]
Here is a base R solution using gregexpr() + lengths() + subset():
# Count the runs of letters in each text and keep the rows with more than two.
dfout <- df[lengths(gregexpr("[[:alpha:]]+", df$text)) > 2, , drop = FALSE]
such that
> dfout
id text
1 1 my text here
2 2 another the here but different
DATA
# Example input with the text stored as a factor (levels in sorted order).
df <- data.frame(
  id = c(1, 2, 3),
  text = factor(
    c("my text here", "another the here but different", "no text"),
    levels = c("another the here but different", "my text here", "no text")
  ),
  stringsAsFactors = FALSE
)
You can use strsplit and lengths to find where you have more than 2 words.
# Split on a space that follows a word boundary; keep rows that yield three
# or more pieces.
df[lengths(strsplit(as.character(df$text), "\\b ")) >= 3, ]
#   id                           text
# 1  1                   my text here
# 2  2 another the here but different

# Alternative: split on runs of non-word characters instead.
df[lengths(strsplit(as.character(df$text), "\\W+")) >= 3, ]
or using gregexpr:
# Keep the rows whose text has at least two runs of non-word characters.
df[lengths(gregexpr("\\W+", df$text)) >= 2, ]
id text
1 1 my text here
2 2 another the here but different
Have a look at Count the number of all words in a string.
I'm looking to manipulate a set of strings in R.
The data I have:
Data Field
Mark Twain 5
I want it to instead be:
Data Field
Twain Mark 5
My idea was to first split the string into two columns and then concatenate. But I'm wondering if there is an easier way.
you can try this approach:
> df <- data.frame(Data=c("Mark Twain"), Field=5)
> # vapply() returns a plain character vector; lapply() would turn Data into a list column.
> df$Data <- vapply(strsplit(as.character(df$Data), " "), function(x) paste(rev(x), collapse=" "), character(1))
> df
Data Field
1 Twain Mark 5
This will work even if the number of rows in your data frame is > 1
we can use sub to do this
# Swap the first two whitespace-separated tokens of each value.
df1[["Data"]] <- sub(
  pattern = "(\\S+)\\s+(\\S+)",
  replacement = "\\2 \\1",
  x = df1[["Data"]]
)
df1
#         Data Field
# 1 Twain Mark     5
data
# One-row example input: a character name and an integer field.
df1 <- data.frame(
  Data = "Mark Twain",
  Field = 5L,
  stringsAsFactors = FALSE
)