Remove ASCII issue in a column - R

In a column like this:
data.frame(id = c(1), text = c("keep<U+0E1E>it"))
Is there any way to use gsub on the text column to remove the characters inside the <> as well as the <> themselves?
Expected output data.frame(id = c(1), text = c("keep it"))

Using stringr package:
library(stringr)
library(dplyr)
df <- data.frame(id = c(1), text = c("keep<U+0E1E>it"))
df
id text
1 1 keep<U+0E1E>it
# Replace every <...> tag with a single space. [^>]* stops at the first
# closing bracket, so text between two separate tags is preserved, and
# str_replace_all() handles more than one tag per string.
df %>% mutate(text = str_replace_all(text, "<[^>]*>", " "))
id text
1 1 keep it
Using gsub:
# [^>]* keeps each match inside a single pair of angle brackets; the tag is
# replaced by a space so the surrounding words stay separated.
gsub("<[^>]*>", " ", df$text)
[1] "keep it"

Related

Find a row that contains a certain string, then take the row above it, the matching row and the row below it, and move them to a new dataframe

So i have a table that looks like this:
I want to search through the first column and, every time I see nl.audio, take the row above it, the nl.audio row itself and the row right below it, and move them into new columns so it looks like this:
not sure how to go about doing this.
the table comes from trying to get nested json values into a dataframe. like this
library(jsonlite)
library(tidyverse)
# List the JSON files in the working directory. `pattern` is a regular
# expression, so the dot is escaped and the match is anchored to the end of
# the file name; otherwise names such as "myjson.txt" would also be returned.
files <- list.files(path = ".", pattern = "\\.json$", all.files = FALSE,
                    full.names = FALSE)
data <- fromJSON(files[1])
dat2 <- unlist(data$translation_map)
dat2 <- as.data.frame(dat2)
dput:
structure(list(dat2 = c("Iraat.",
" _1645805605.mp3",
"Ie.", "wn", "", "Wdis.",
"ewdewf.mp3",
"wedew.", "[k]ws.[/k]",
" _1645805740.mp3",
"edwedwedw.", "Ik ewwewe[/k].",
"we45805760.mp3",
"I h89.", "ewd3n", "", "ad23dt", "",
"Ik d2. ", "I d2d3.",
"Ha3d3d/k] 20.", "H3d20.",
"id3n", "", "straat")), row.names = c("str-5e854867d9c6.nl.value",
"str_f15f7751-227dc6.nl.audio", "str_f15f7751.en.value",
"str.nl.value", "str_172a516ca.en.value",
"str_4567f686.nl.value", "str_4.nl.audio",
"stcb0ca14.en.value", "str_622f99395.nl.value",
"str_622f9395.nl.audio", "str_622f90de9395.en.value",
"str_f25afe16.nl.value", "str_f2fad09045afe16.nl.audio",
"str_f2fad89045afe16.en.value", "s9e844c432e80.nl.value",
"str_b0c1b42e80.en.value", "str_e6d847f3-60b7-.nl.value",
"str_.en.value", "str_b61f9404-.nl.value",
"str_ b.en.value", "str_76e28ea6.nl.value",
"str-61a1b83bf1ba.en.value", "str_6280d5a49c42a24.nl.value",
"str5-0d5a49c42a24.en.value", "str_5e6b2202e748.nl.value"
), class = "data.frame")
Something like this:
library(dplyr)
library(stringr)
# Build one row per "nl.audio" entry: the value above it (A), the audio file
# name itself (B) and the value below it (C).
df %>%
  # Trim and collapse whitespace in every column (the values carry stray tabs).
  # .cols is given explicitly: calling across() without it is deprecated.
  mutate(across(everything(), str_squish)) %>%
  mutate(
    # fixed() matches the literal text "nl.audio"; as a regex the dot would
    # match any character.
    A = ifelse(str_detect(V1, fixed("nl.audio")), lag(V2), NA_character_),
    # B = str_extract(V2, '\\d+.mp3'),
    B = str_extract(V2, ".*\\.mp3$"),
    C = ifelse(str_detect(V1, fixed("nl.audio")), lead(V2), NA_character_),
    .keep = "unused"
  ) %>%
  # Only the nl.audio rows have all three of A, B and C filled in.
  na.omit()
A B C
2 nstraat. 1645805605.mp3 constraat.
7 tihdhis. 645805622.mp3 use.
df <- structure(list(V1 = c("str_f15d9c6.nl.value", "47c-5e854867d9c6.nl.audio",
"5e854867d9c6.en.value", "92bd-91b8f180bd3a.nl.value", "4-92bd-91b8f180bd3a.en.value",
"40a8-88ef-5890ecbOca14.nl.value", "890ecbOca14.nl.audio", "ca14.en.value"
), V2 = c("\tnstraat.", "\t1645805605.mp3", "\tconstraat.", "\tlemons",
" \t", "\ttihdhis.", "\t645805622.mp3", "\tuse.")), class = "data.frame", row.names = c(NA,
-8L))
We may need grep to find the index. Then add and subtract 1 to the index and extract the values from the second column based on that index (assuming data.frame columns)
# Locate the rows whose key contains the literal "nl.audio"
# (fixed = TRUE, so the dot is not treated as a regex wildcard).
i1 <- grep("nl.audio", df1[[1]], fixed = TRUE)
prev_ind <- i1 - 1
next_ind <- i1 + 1
# An index of 0 is silently dropped by `[`, which would leave col1 shorter
# than the other columns; map it to NA so the columns stay aligned.
prev_ind[prev_ind < 1] <- NA
# Row above, the nl.audio row itself, and the row below. An index past the
# last row simply yields NA.
data.frame(col1 = df1[[2]][prev_ind],
           col2 = df1[[2]][i1],
           col3 = df1[[2]][next_ind])

Move substring to end of string in R

I have a set of df with a large number of columns. The column names follow a pattern like so:
my.df <- data.frame(sentiment_brand1_1 = c(1,0,0,1), sentiment_brand1_2 = c(0,1,1,0),
sentiment_brand2_1 = c(1,1,1,1),
sentiment_brand2_2 = c(0,0,0,0),
brand1_rating_1 = c(1,2,3,4),
brand2_rating_1 = c(4,3,2,1))
I'd like to programmatically rename the columns, moving the substrings "brand1" and "brand2" from the middle of the column name to the end, e.g.:
desired_colnames <- c("sentiment_1_brand1",
"sentiment_2_brand1",
"sentiment_1_brand2",
"sentiment_2_brand2",
"rating_1_brand1",
"rating_1_brand2")
Capture the substring groups and rearrange in replacement
# Move the "brandN" token to the end of each name. The inner sub() handles a
# brand that follows an underscore ("x_brand2_1" -> "x_1_brand2"); the outer
# sub() handles a name that starts with the brand ("brand2_x" -> "x_brand2").
sub("^(brand[0-9]+)_(.*)$", "\\2_\\1",
    sub("(.*)_(brand[0-9]+)(.*)", "\\1\\3_\\2", v1))
-output
[1] "variable_1_brand1" "_stuff_1_brand1" "thing_brand1"
data
v1 <- c("variable_brand1_1", "_brand1_stuff_1", "_brand1thing")
## Input:
Test <- c("variable_brand1_1", "_brand1_stuff_1", "_brand1thing")
library("stringr")
# Drop the first "_brand1" from each name and append it at the end.
paste0(str_remove(Test, "_brand1"), "_brand1")
## OutPut:
[1] "variable_1_brand1" "_stuff_1_brand1" "thing_brand1"

Create a new column with word counter

In a dataframe like this:
data.frame(id = c(1,2), alternativenum = c(342, 5921), text = c("one text here","another text here also"))
How is it possible to add a new column in this dataframe where it will contain a word counter for every row using the text column?
Example of expected output
data.frame(id = c(1,2), alternativenum = c(342, 5921), text = c("one text here","another text here also"), counter = c(3, 4))
The separator is always the blank space between words
You can use str_count() from the stringr package to count the number of words using the \\w+ escape pattern.
df <- data.frame(
id = c(1,2),
alternativenum = c(342, 5921),
text = c("one text here","another text here also"),
stringsAsFactors = FALSE)
library(stringr)
library(dplyr)
# Count the words in each row's text. Refer to the column as `text` rather
# than `df$text`, so mutate() uses the data it was given (this matters once
# the data frame is grouped, filtered or renamed upstream).
df %>%
  mutate(counter = str_count(text, pattern = "\\w+"))
id alternativenum text counter
1 1 342 one text here 3
2 2 5921 another text here also 4

Select rows with certain values and update in R

I want to select rows that contain 'dm' (something like SQL's LIKE '%dm%') and then update those rows by adding '*' at the beginning and the end of the string. An example would be:
id text
1 abc
2 admc
The output:
id text
1 abc
2 *admc*
You can identify the rows with "dm" using grep/grepl and change their values using paste0.
# Positions of the entries whose text contains "dm".
inds <- which(grepl('dm', df$text))
# Wrap only those entries in asterisks; all other rows are left untouched.
df$text[inds] <- paste0('*', df$text[inds], '*')
df
# id text
#1 1 abc
#2 2 *admc*
Using data.table syntax avoids the creation of temporary variable (inds).
library(data.table)
# Convert df with setDT(), pick the rows whose text matches 'dm' via grep(),
# and use := to overwrite text in just those rows with the star-wrapped value.
setDT(df)[grep('dm', text), text := paste0('*', text, '*')]
data
df <- structure(list(id = 1:2, text = c("abc", "admc")),
class = "data.frame", row.names = c(NA, -2L))
Here is a tidyverse solution.
First, detect the presence of "dm" with stringr::str_detect
Then, using dplyr::if_else, if "dm" is present, prepend/append "*" using paste0; else, keep the text as-is
_
# Star-wrap text only where it contains "dm"; other rows pass through as-is.
df %>%
  mutate(
    text = if_else(
      condition = str_detect(text, "dm"),
      true = paste0("*", text, "*"),
      false = text
    )
  )
data
df <- read.table(text = "id text
1 abc
2 admc", header = TRUE, stringsAsFactors = FALSE)

Preprocessing: text analysis on many columns from a dataframe

Using the following lines it is possible to preprocess text in a specific column of my dataframe:
#text to lower case
df$name <- tolower(df$name)
#remove all special characters
df$name <- gsub("[[:punct:]]", " ", df$name)
#remove long spaces
df$name <- gsub("\\s+"," ",str_trim(df$name))
I would like to apply these preprocessing rules to all columns (except id) of a dataframe like this:
df <- data.frame(id = c("A","B","C"), D = c("mytext 11","mytext +", "!!"), E = c("text","stg","1.2"), F = c("press","remove","22"))
If you want to do something multiple times, it is often useful to define a function.
For example, you could do the following:
library(stringr)
df <- data.frame(id = c("A","B","C"), D = c("mytext 11","mytext +", "!!"),
E = c("text","stg","1.2"), F = c("press","remove","22"))
# create a function so we can apply this multiple times easily.
# Clean a vector of text values:
#   1. lower-case everything,
#   2. turn each punctuation character into a space,
#   3. trim both ends and squeeze every run of whitespace to one space.
# my_vector: the values to clean. Returns the cleaned character vector.
process <- function(my_vector) {
  lowered <- tolower(my_vector)
  without_punct <- gsub("[[:punct:]]", " ", lowered)
  gsub("\\s+", " ", str_trim(without_punct))
}
# for all columns except 'id', apply our function.
# Run process() over every column other than 'id', writing each result back
# into the data frame.
text_cols <- setdiff(colnames(df), "id")
for (col_name in text_cols) {
  df[[col_name]] <- process(df[[col_name]])
}
You can use dplyr::mutate_at() to mutate multiple columns; in this case, all columns except for id:
# Apply processText() to every column except id. across() replaces the
# superseded mutate_at()/vars() pair.
mydf %>%
  mutate(across(.cols = -id, .fns = processText))
Where processText is a function containing your desired code:
# Normalise a character vector: lower-case it, turn punctuation and symbol
# characters into spaces, then trim the ends and collapse runs of whitespace.
#
# str: the text values to clean.
# Returns the cleaned character vector.
processText <- function(str) {
  str %>%
    str_to_lower() %>%
    # stringr uses ICU regular expressions, where [[:punct:]] leaves out
    # symbols such as "+" or "$"; \p{P} plus \p{S} covers both groups.
    str_replace_all(pattern = "[\\p{P}\\p{S}]", replacement = " ") %>%
    # The previous "[\\s+]" was a character class (one whitespace character
    # or a literal "+"), so runs of spaces were never collapsed.
    # str_squish() trims both ends and reduces inner runs to a single space.
    str_squish()
}
The output is as follows:
id D E F
1 A mytext 11 text press
2 B mytext stg remove
3 C 1 2 22

Resources