Create a new column with word counter - r

In a dataframe like this:
data.frame(id = c(1,2), alternativenum = c(342, 5921), text = c("one text here","another text here also"))
How is it possible to add a new column in this dataframe where it will contain a word counter for every row using the text column?
Example of expected output
data.frame(id = c(1,2), alternativenum = c(342, 5921), text = c("one text here","another text here also"), counter = c(3, 4))
The separator is always the blank space between words

You can use str_count() from the stringr package to count the number of words using the \\w+ escape pattern.
df <- data.frame(
id = c(1,2),
alternativenum = c(342, 5921),
text = c("one text here","another text here also"),
stringsAsFactors = FALSE)
library(stringr)
library(dplyr)
# Add a `counter` column holding the number of words in each row's `text`.
# "\\w+" matches each maximal run of word characters, so the number of
# matches per string is its word count.
# The column is referenced as bare `text` rather than `df$text`, so that
# mutate() works on the data coming through the pipe (this matters as soon as
# the pipeline filters, groups or reorders rows before this step).
df %>%
  mutate(counter = str_count(text, pattern = "\\w+"))
id alternativenum text counter
1 1 342 one text here 3
2 2 5921 another text here also 4

Related

Find a row that contains a certain string, then take the row above it, the matching row and the row below it, and move them to a new dataframe

So i have a table that looks like this:
I want to search through the first column, and every time I see nl.audio, take the row above it, the nl.audio row itself and the row right under it, and move them into new columns so it looks like this:
not sure how to go about doing this.
the table comes from trying to get nested json values into a dataframe. like this
library(jsonlite)
library(tidyverse)
# List the JSON files in the working directory. `pattern` is a regular
# expression, so the dot is escaped and the match is anchored at the end of
# the name; the unescaped ".json" would also match names such as "myjson.txt".
files <- list.files(path = ".", pattern = "\\.json$", all.files = FALSE,
                    full.names = FALSE)
# Parse the first file and flatten its nested `translation_map` element into a
# named vector; the names become the row names of the resulting data frame.
data <- fromJSON(files[1])
dat2 <- unlist(data$translation_map)
dat2 <- as.data.frame(dat2)
dput:
structure(list(dat2 = c("Iraat.",
" _1645805605.mp3",
"Ie.", "wn", "", "Wdis.",
"ewdewf.mp3",
"wedew.", "[k]ws.[/k]",
" _1645805740.mp3",
"edwedwedw.", "Ik ewwewe[/k].",
"we45805760.mp3",
"I h89.", "ewd3n", "", "ad23dt", "",
"Ik d2. ", "I d2d3.",
"Ha3d3d/k] 20.", "H3d20.",
"id3n", "", "straat")), row.names = c("str-5e854867d9c6.nl.value",
"str_f15f7751-227dc6.nl.audio", "str_f15f7751.en.value",
"str.nl.value", "str_172a516ca.en.value",
"str_4567f686.nl.value", "str_4.nl.audio",
"stcb0ca14.en.value", "str_622f99395.nl.value",
"str_622f9395.nl.audio", "str_622f90de9395.en.value",
"str_f25afe16.nl.value", "str_f2fad09045afe16.nl.audio",
"str_f2fad89045afe16.en.value", "s9e844c432e80.nl.value",
"str_b0c1b42e80.en.value", "str_e6d847f3-60b7-.nl.value",
"str_.en.value", "str_b61f9404-.nl.value",
"str_ b.en.value", "str_76e28ea6.nl.value",
"str-61a1b83bf1ba.en.value", "str_6280d5a49c42a24.nl.value",
"str5-0d5a49c42a24.en.value", "str_5e6b2202e748.nl.value"
), class = "data.frame")
Something like this:
library(dplyr)
library(stringr)
# Build one row per "nl.audio" entry with the value above it (A), its own
# .mp3 value (B) and the value below it (C).
df %>%
  # Trim and collapse whitespace in every column. The columns are named
  # explicitly with everything(); calling across() without `.cols` is
  # deprecated in recent dplyr versions.
  mutate(across(everything(), str_squish)) %>%
  mutate(
    # fixed() matches "nl.audio" literally, so the dot is not a wildcard.
    A = ifelse(str_detect(V1, fixed("nl.audio")), lag(V2), NA_character_),
    # Alternative that keeps only the numeric part of the file name:
    # B = str_extract(V2, "\\d+\\.mp3"),
    # The dot before "mp3" is escaped so that only a real ".mp3" suffix matches.
    B = str_extract(V2, ".*\\.mp3$"),
    C = ifelse(str_detect(V1, fixed("nl.audio")), lead(V2), NA_character_),
    .keep = "unused"
  ) %>%
  # A and C are NA on every row that is not an "nl.audio" row, so na.omit()
  # drops those rows.
  na.omit()
A B C
2 nstraat. 1645805605.mp3 constraat.
7 tihdhis. 645805622.mp3 use.
df <- structure(list(V1 = c("str_f15d9c6.nl.value", "47c-5e854867d9c6.nl.audio",
"5e854867d9c6.en.value", "92bd-91b8f180bd3a.nl.value", "4-92bd-91b8f180bd3a.en.value",
"40a8-88ef-5890ecbOca14.nl.value", "890ecbOca14.nl.audio", "ca14.en.value"
), V2 = c("\tnstraat.", "\t1645805605.mp3", "\tconstraat.", "\tlemons",
" \t", "\ttihdhis.", "\t645805622.mp3", "\tuse.")), class = "data.frame", row.names = c(NA,
-8L))
We may need grep to find the index. Then add and subtract 1 to the index and extract the values from the second column based on that index (assuming data.frame columns)
# Positions of the rows whose first column contains the literal text
# "nl.audio" (fixed = TRUE, so the dot is not a regex wildcard).
i1 <- grep("nl.audio", df1[[1]], fixed = TRUE)
# Row directly above and row directly below each match.
prev_ind <- i1 - 1
next_ind <- i1 + 1
# A match on the very first row would give index 0, which R silently drops and
# which would leave the columns with unequal lengths; use NA instead so the
# result keeps one row per match. (An index past the last row already
# yields NA.)
prev_ind[prev_ind < 1] <- NA
# One output row per match: value above, the matched row's value, value below.
data.frame(
  col1 = df1[[2]][prev_ind],
  col2 = df1[[2]][i1],
  col3 = df1[[2]][next_ind]
)

Move substring to end of string in R

I have a set of df with a large number of columns. The column names follow a pattern like so:
my.df <- data.frame(sentiment_brand1_1 = c(1,0,0,1), sentiment_brand1_2 = c(0,1,1,0),
sentiment_brand2_1 = c(1,1,1,1),
sentiment_brand2_2 = c(0,0,0,0),
brand1_rating_1 = c(1,2,3,4),
brand2_rating_1 = c(4,3,2,1))
I'd like to programmatically rename the columns, moving the substrings "brand1" and "brand2" from the middle of the column name to the end, e.g.:
desired_colnames <- c("sentiment_1_brand1",
"sentiment_2_brand1",
"sentiment_1_brand2",
"sentiment_2_brand2",
"rating_1_brand1",
"rating_1_brand2")
Capture the substring groups and rearrange in replacement
# Move the brand token to the end of each name.
# Inner sub(): handles a brand that follows an underscore ("x_brand1_1",
# "_brand1thing"); "brand\\d+" covers brand1, brand2, ... instead of only
# brand1.
# Outer sub(): handles names that start with the brand ("brand1_rating_1"),
# which the inner pattern cannot match because there is no leading "_".
sub("^(brand\\d+)_(.*)$", "\\2_\\1",
    sub("(.*)_(brand\\d+)(.*)", "\\1\\3_\\2", v1))
-output
[1] "variable_1_brand1" "_stuff_1_brand1" "thing_brand1"
data
v1 <- c("variable_brand1_1", "_brand1_stuff_1", "_brand1thing")
## Input:
Test <- c("variable_brand1_1", "_brand1_stuff_1", "_brand1thing")
library("stringr")
# Drop the first "_brand1" from each name and append it at the end.
# paste0() is the idiomatic form of paste(..., sep = "").
paste0(str_remove(Test, "_brand1"), "_brand1")
## OutPut:
[1] "variable_1_brand1" "_stuff_1_brand1" "thing_brand1"

Remove ASCII issue in a column

In a column like this:
data.frame(id = c(1), text = c("keep<U+0E1E>it"))
Is there any way to use gsub on the text column to remove the characters inside the <> and also remove the <> themselves?
Expected output data.frame(id = c(1), text = c("keep it"))
Using stringr package:
library(stringr)
library(dplyr)
df <- data.frame(id = c(1), text = c("keep<U+0E1E>it"))
df
id text
1 1 keep<U+0E1E>it
# Replace every <...> tag with a single space so the result is "keep it".
# "<[^>]*>" matches one tag at a time; the greedy "<.*>" would also swallow
# any text lying between two separate tags, and str_remove() only removes the
# first match and leaves no space behind.
df %>% mutate(text = str_replace_all(text, "<[^>]*>", " "))
# id text
# 1 1 keep it
Using gsub:
# Base R equivalent: replace each <...> tag with a space. "[^>]*" stops at the
# first ">", so two tags in one string are handled separately instead of
# removing everything between the first "<" and the last ">".
gsub("<[^>]*>", " ", df$text)
# [1] "keep it"

Select rows with certain values and update in R

I want to select rows that contain 'dm' (something like `like %dm%` should be used), and then update those rows by adding '*' at the beginning and the end of the string. An example would be:
id text
1 abc
2 admc
The output:
id text
1 abc
2 *admc*
You can identify the rows with "dm" using grep/grepl and change their values using paste0.
inds <- grep('dm', df$text)
df$text[inds] <- paste0('*', df$text[inds], '*')
df
# id text
#1 1 abc
#2 2 *admc*
Using data.table syntax avoids the creation of temporary variable (inds).
library(data.table)
setDT(df)[grep('dm', text), text := paste0('*', text, '*')]
data
df <- structure(list(id = 1:2, text = c("abc", "admc")),
class = "data.frame", row.names = c(NA, -2L))
Here is a tidyverse solution.
First, detect the presence of "dm" with stringr::str_detect
Then, using dplyr::if_else, if "dm" is present, prepend/append "*" using paste0; else, keep the text as-is
_
df %>%
mutate(text = if_else(str_detect(text, "dm"), paste0("*", text, "*"), text))
data
df <- read.table(text = "id text
1 abc
2 admc", header = TRUE, stringsAsFactors = FALSE)

Count the length of words and cut the rows which are under a threshold

Having data frame like this one:
data.frame(id = c(1,2,3), text = c("my text here", "another the here but different", "no text"))
How is it possible to count the number of words in every row and drop the rows which have 2 words or fewer?
Expected output
data.frame(id = c(1,2), text = c("my text here", "another the here but different"))
One option utilizing the stringr library could be:
df[!is.na(word(df$text, 3)), ]
id text
1 1 my text here
2 2 another the here but different
Or another option using the stringr library (provided by @Sotos):
df[str_count(df$text, fixed(" ")) >= 2, ]
Here is a base R solution using gregexpr() + lengths() + subset():
dfout <- subset(df,lengths(gregexpr("[[:alpha:]]+",df$text))>2)
such that
> dfout
id text
1 1 my text here
2 2 another the here but different
DATA
df <- structure(list(id = c(1, 2, 3), text = structure(c(2L, 1L, 3L
), .Label = c("another the here but different", "my text here",
"no text"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
You can use strsplit and lengths to find where you have more than 2 words.
df[lengths(strsplit(as.character(df$text), "\\b ")) > 2,]
# id text
#1 1 my text here
#2 2 another the here but different
df[lengths(strsplit(as.character(df$text), "\\W+")) > 2,] #Alternative
or using gregexpr:
df[lengths(gregexpr("\\W+", df$text)) > 1,]
id text
1 1 my text here
2 2 another the here but different
Have a look at Count the number of all words in a string.

Resources