Manipulate string in R - r

I'm looking to manipulate a set of strings in R.
The data I have:
Data Field
Mark Twain 5
I want it to instead be:
Data Field
Twain Mark 5
My idea was to first split the string into two columns and then concatenate. But I'm wondering if there is an easier way.

you can try this approach:
# Build a one-row example data frame.
df <- data.frame(Data = "Mark Twain", Field = 5)
# Split each entry on spaces, reverse the words and paste them back together.
# vapply() returns a character vector, so Data remains an ordinary character
# column; lapply() would replace it with a list column.
df$Data <- vapply(
  strsplit(as.character(df$Data), " "),
  function(x) paste(rev(x), collapse = " "),
  character(1)
)
df
#         Data Field
# 1 Twain Mark     5
This will work even if the number of rows in your data frame is > 1

we can use sub to do this
df1$Data <- sub("(\\S+)\\s+(\\S+)", "\\2 \\1", df1$Data)
df1
# Data Field
#1 Twain Mark 5
data
df1 <- structure(list(Data = "Mark Twain", Field = 5L),
.Names = c("Data", "Field"), class = "data.frame",
row.names = c(NA, -1L))

Related

Find rows whose name contains a certain string, then take the row above, the matching row and the row below and move them to a new dataframe

So i have a table that looks like this:
I want to search through the first column and, every time I see nl.audio, take the row above it, the nl.audio row itself and the row right below it, and move them to new columns so it looks like this:
I'm not sure how to go about doing this.
The table comes from trying to get nested JSON values into a dataframe, like this:
library(jsonlite)
library(tidyverse)
files <- list.files(path=".", pattern=".json", all.files=FALSE,
full.names=FALSE)
data <- fromJSON(files[1])
dat2 <- unlist(data$translation_map)
dat2 <- as.data.frame(dat2)
dput:
structure(list(dat2 = c("Iraat.",
" _1645805605.mp3",
"Ie.", "wn", "", "Wdis.",
"ewdewf.mp3",
"wedew.", "[k]ws.[/k]",
" _1645805740.mp3",
"edwedwedw.", "Ik ewwewe[/k].",
"we45805760.mp3",
"I h89.", "ewd3n", "", "ad23dt", "",
"Ik d2. ", "I d2d3.",
"Ha3d3d/k] 20.", "H3d20.",
"id3n", "", "straat")), row.names = c("str-5e854867d9c6.nl.value",
"str_f15f7751-227dc6.nl.audio", "str_f15f7751.en.value",
"str.nl.value", "str_172a516ca.en.value",
"str_4567f686.nl.value", "str_4.nl.audio",
"stcb0ca14.en.value", "str_622f99395.nl.value",
"str_622f9395.nl.audio", "str_622f90de9395.en.value",
"str_f25afe16.nl.value", "str_f2fad09045afe16.nl.audio",
"str_f2fad89045afe16.en.value", "s9e844c432e80.nl.value",
"str_b0c1b42e80.en.value", "str_e6d847f3-60b7-.nl.value",
"str_.en.value", "str_b61f9404-.nl.value",
"str_ b.en.value", "str_76e28ea6.nl.value",
"str-61a1b83bf1ba.en.value", "str_6280d5a49c42a24.nl.value",
"str5-0d5a49c42a24.en.value", "str_5e6b2202e748.nl.value"
), class = "data.frame")
Something like this:
library(dplyr)
library(stringr)
# For each audio row (V1 contains "nl.audio"), gather the value of the row
# above (A), the audio file name on that row (B) and the value of the row
# below (C). Rows with an NA in any of the three are dropped at the end.
df %>%
  # Trim and collapse whitespace in every column. across() needs an explicit
  # column selection; calling it without one is deprecated.
  mutate(across(everything(), str_squish)) %>%
  mutate(
    # fixed() matches "nl.audio" literally; as a regex the "." would match
    # any character.
    A = if_else(str_detect(V1, fixed("nl.audio")), lag(V2), NA_character_),
    # Alternative that keeps only the numeric part of the file name:
    # B = str_extract(V2, '\\d+\\.mp3'),
    B = str_extract(V2, ".*\\.mp3$"),
    C = if_else(str_detect(V1, fixed("nl.audio")), lead(V2), NA_character_),
    # Keep only the new columns; V1 and V2 were used to build them.
    .keep = "unused"
  ) %>%
  na.omit()
A B C
2 nstraat. 1645805605.mp3 constraat.
7 tihdhis. 645805622.mp3 use.
df <- structure(list(V1 = c("str_f15d9c6.nl.value", "47c-5e854867d9c6.nl.audio",
"5e854867d9c6.en.value", "92bd-91b8f180bd3a.nl.value", "4-92bd-91b8f180bd3a.en.value",
"40a8-88ef-5890ecbOca14.nl.value", "890ecbOca14.nl.audio", "ca14.en.value"
), V2 = c("\tnstraat.", "\t1645805605.mp3", "\tconstraat.", "\tlemons",
" \t", "\ttihdhis.", "\t645805622.mp3", "\tuse.")), class = "data.frame", row.names = c(NA,
-8L))
We may need grep to find the index. Then add and subtract 1 to the index and extract the values from the second column based on that index (assuming data.frame columns)
# Locate the rows whose key (first column) contains the literal "nl.audio".
i1 <- grep("nl.audio", df1[[1]], fixed = TRUE)
# Neighbouring row positions. A match on the first row has no row above it;
# use NA there so all three columns keep the same length.
prev_ind <- i1 - 1
prev_ind[prev_ind < 1] <- NA
next_ind <- i1 + 1
# One output row per match: the value above, the matched row's own value and
# the value below (a match on the last row yields NA for col3).
data.frame(col1 = df1[[2]][prev_ind],
           col2 = df1[[2]][i1],
           col3 = df1[[2]][next_ind])

Select rows with certain values and update in R

I want to select rows whose text contains 'dm' (something like SQL's LIKE '%dm%') and then update those rows by adding '*' at the beginning and the end of the string. An example would be:
id text
1 abc
2 admc
The output:
id text
1 abc
2 *admc*
You can identify the rows with "dm" using grep/grepl and change their values using paste0.
inds <- grep('dm', df$text)
df$text[inds] <- paste0('*', df$text[inds], '*')
df
# id text
#1 1 abc
#2 2 *admc*
Using data.table syntax avoids the creation of temporary variable (inds).
library(data.table)
setDT(df)[grep('dm', text), text := paste0('*', text, '*')]
data
df <- structure(list(id = 1:2, text = c("abc", "admc")),
class = "data.frame", row.names = c(NA, -2L))
Here is a tidyverse solution.
First, detect the presence of "dm" with stringr::str_detect
Then, using dplyr::if_else, if "dm" is present, prepend/append "*" using paste0; else, keep the text as-is
_
df %>%
mutate(text = if_else(str_detect(text, "dm"), paste0("*", text, "*"), text))
data
df <- read.table(text = "id text
1 abc
2 admc", header = TRUE, stringsAsFactors = FALSE)

Subset a data frame according to a nested list when the values contain white space

I have a data frame and I would like to subset it according to specific values. When I tried to do it, there was a problem because of the white space inside the values in sample_df$mentions.
I used this script for subsetting the data frame:
sample_list <- list()
for (i in colnames(sample_name)){
sample_list <- sapply(sample_df$mentions, function(x)any(x %in% sample_name[[i]]))
new_sample_df <- sample_df[sample_list,]
}
I have tried strsplit function to get rid of the space but it has created other problems.
sample_df$mentions <- strsplit(as.charater(sample_df$mentions),"[[:space:]]")
Thank you for your help in advance.
My expected outcome should be like this:
mentions screen_name
5 islambey1453, hamzayerlikaya, tahaayhan, hidoturkoglu15 ak_Furkan54
10 nurhandnci, SSSBBL777, serkanacar007, Chequevera06, kubilayy81 tanrica_gaia
sample_name reproducible data:
sample_name <- structure(list(Name = structure(2:1, .Label = c("hamzayerlikaya",
"SSSBBL777"), class = "factor")), row.names = c(NA, -2L), class = "data.frame")
sample_df reproducible data:
sample_df <- structure(list(mentions = list(character(0), "srgnsnmz92", character(0),
"Berivan_Aslan_", c("islambey1453", " hamzayerlikaya", " tahaayhan",
" hidoturkoglu15"), character(0), "themarginale", character(0),
character(0), c("nurhandnci", " SSSBBL777", " serkanacar007",
" Chequevera06", " kubilayy81")), screen_name = c("SaadetYakar",
"beraydogru", "EL_Turco_DLC", "hebunagel", "ak_Furkan54", "zaferakyol011",
"melmitem", "mobbingabla", "BekarKronik", "tanrica_gaia")), row.names = c(NA,
10L), class = "data.frame")
We can loop through the 'Name' and use that in grepl, Reduce it to a single logical vector and subset the rows of 'sample_df'
sample_df[Reduce(`|`, lapply(as.character(sample_name$Name),
grepl, x = sample_df$mentions)),]
# mentions screen_name
#5 islambey1453, hamzayerlikaya, tahaayhan, hidoturkoglu15 ak_Furkan54
#10 nurhandnci, SSSBBL777, serkanacar007, Chequevera06, kubilayy81 tanrica_gaia
NOTE: This would work with any length of 'Name' column
Another option is regex_inner_join
library(fuzzyjoin)
library(tidyverse)
regex_inner_join(sample_df, sample_name, by = c("mentions" = "Name")) %>%
select(mentions, screen_name)
# mentions screen_name
#1 islambey1453, hamzayerlikaya, tahaayhan, hidoturkoglu15 ak_Furkan54
#2 nurhandnci, SSSBBL777, serkanacar007, Chequevera06, kubilayy81 tanrica_gaia
Since mentions is a list we can use sapply and select only those rows in sample_df where any of the mentions has Name in it.
sample_df[sapply(sample_df$mentions, function(x) any(grepl(pattern, x))), ]
# mentions screen_name
#5 islambey1453, hamzayerlikaya, tahaayhan, hidoturkoglu15 ak_Furkan54
#10 nurhandnci, SSSBBL777, serkanacar007, Chequevera06, kubilayy81 tanrica_gaia
where pattern is
pattern = paste0("\\b", sample_name$Name, "\\b", collapse = "|")

Removing the special symbols in data.frame column values

I have two data frame each with a column Name
df1:
name
#one2
!iftwo
there_2_go
come&go
df1 = structure(list(name = c("#one2", "!iftwo", "there_2_go", "come&go")),.Names = c("name"), row.names = c(NA, -4L), class = "data.frame")
df2:
name
One2
IfTwo#
there-2-go
come.go
df2 = structure(list(name = c("One2", "IfTwo#", "there-2-go", "come.go")),.Names = c("name"), row.names = c(NA, -4L), class = "data.frame")
Comparing the two data frames for inequality using %in% is cumbersome because of the special symbols. Removing the special symbols with stringr could be useful, but how exactly can we use stringr functions with %in% and display the mismatches between them?
I have already used mutate() to convert everything to lowercase with tolower(), as follows:
df1<-mutate(df1,name=tolower(df1$name))
df2<-mutate(df2,name=tolower(df2$name))
Current output of comparison:
df2[!(df2 %in% df1),]
[1] "one2" "iftwo#" "there-2-go" "come.go"
Expected output as essentially the contents are same but with special symbols:
df2[!(df2 %in% df1),]
character(0)
Question: how do we ignore the special symbols in the contents of the data frames?
Here it is in a function,
# Flag the names in df1 that have no counterpart in df2 once case and
# punctuation are ignored.
#
# Args:
#   df1, df2: data frames with a `name` column.
# Returns:
#   A logical vector with one element per row of df1, named by the normalised
#   df1 name. TRUE means that no normalised df2 name occurs inside it.
f1 <- function(df1, df2) {
  i1 <- tolower(gsub("[[:punct:]]", "", df1$name))
  i2 <- tolower(gsub("[[:punct:]]", "", df2$name))
  # Build the alternation of df2 names once rather than once per df1 name.
  pattern <- paste(i2, collapse = "|")
  # vapply() always yields a logical vector, so an empty df1 gives logical(0)
  # instead of the list() that sapply() returns (which `!` cannot negate).
  d1 <- vapply(i1, function(i) grepl(pattern, i), logical(1))
  !d1
}
# Compare the two data frames from the question (df1 against df2).
f1(df1, df2)
# one2 iftwo there2go comego
# FALSE FALSE FALSE FALSE
#or use it for indexing,
df2[f1(df1, df2),]
#character(0)

R: Building a list from matching values in a data.frame

I have a 3 column data frame which looks a little like this:
id name links
1 134235 dave "34657","34563","23459"
2 23459 mary "134235","45868","45677"
3 165432 jane "134235","23459","44657"
where id and name values are unique, and links is a string of ids which indicate an association with some of the names in each row. So for example dave includes the links id 23459 which is mary so dave is connected to mary. What I need to produce is a pair list of all the connections in the data so with the example data I would output something like:
dave,mary
mary,dave
jane,dave
jane,mary
I'm very new to R and have seen amazing things done with functions like apply. Before going off and trying to replicate a solution that would look more like a JavaScript routine and be very inefficient, I wondered if anyone could help.
One solution, using Matt's dput():
tab <- structure(list(
id = c("134235", "23459", "165432"),
name = c("dave", "mary", "jane"),
links = c("'34657', '34563', '23459'",
"'134235', '45868', '45677'",
"'134235', '23459', '44657'")),
.Names = c("id", "name", "links"),
row.names = c(NA, -3L), class = "data.frame")
# Build "name,linked_name" pairs for one person.
#
# Args:
#   name:  the person's name.
#   links: a string of quoted, comma-separated ids, e.g. "'1', '2'".
# Returns:
#   A character vector with one "name,other" entry for every row of `tab`
#   whose id appears in `links`, or character(0) when there is none.
# Note: reads the lookup table `tab` (columns id and name) from the
# enclosing environment.
conns <- function(name, links) {
  # Strip the quotes, split on commas and trim the padding around each id.
  link_ids <- trimws(
    unlist(strsplit(gsub("'|\"", "", links), ",", fixed = TRUE))
  )
  # Compare ids in the type tab$id already has. Converting the links to
  # numbers while tab$id is character breaks ids such as "100000", which
  # would be coerced back as "1e+05" and never match.
  if (is.numeric(tab$id)) {
    link_ids <- as.numeric(link_ids)
  }
  linked <- tab$name[tab$id %in% link_ids]
  # paste() would turn an empty match into the bogus entry "name,".
  if (length(linked) == 0L) {
    return(character(0))
  }
  paste(name, linked, sep = ",")
}
# Run conns() over every (name, links) pair and flatten the per-person
# results into a single unnamed vector.
connections <- unlist(Map(conns, tab$name, tab$links), use.names = FALSE)
The first step should be to normalize the data, in particular, parse the strings.
You can use ddply: it applies a function
that takes a chunk of a data.frame (a row, in our case)
and transforms it in some way. You just have to write a function
that works on one row, i.e., on one string.
# Sample data
n <- 10
k <- 3
# Draw random ids. sprintf() keeps them in plain digits, whereas
# as.character() would render 100000 as "1e+05".
ids <- sprintf("%.0f", unique(round(1e5 * runif(n))))
# Duplicates were dropped above, so refresh n.
n <- length(ids)
names <- LETTERS[seq_len(n)]
# For every id pick k distinct other ids to link to.
links <- lapply(ids, function(u) sample(setdiff(ids, u), k, replace = FALSE))
# Encode each link set as a quoted, comma-separated string: "a","b","c"
links <- vapply(
  links,
  function(u) paste0('"', paste(u, collapse = '","'), '"'),
  character(1)
)
d <- data.frame(
  id = ids,
  name = names,
  links = links,
  stringsAsFactors = FALSE
)
library(plyr)
library(stringr)
dd <- ddply(
d,
c("id", "name"),
function(u) data.frame(
id=u$id,
name=u$name,
link=unlist(str_split( str_replace_all( u$links, '"', '' ), "," ))
))
You can then join the data, either with merge or sqldf.
library(sqldf)
sqldf("
SELECT A.name, B.name
FROM dd AS A, d AS B
WHERE A.link = B.id
")
dat<- structure(list(
id = c("134235", "23459", "165432"),
name = c("dave", "mary", "jane"),
links = c("'34657', '34563', '23459'",
"'134235', '45868', '45677'",
"'134235', '23459', '44657'")),
.Names = c("id", "name", "links"),
row.names = c(NA, -3L), class = "data.frame")
# It can all be done in base, of course...
library(stringr)
library(reshape2)
# This would be easy to do if links weren't in that format -
# one record per id-link pair would be preferable.
# Split dat$links and remove any quotes
# One row per person, with the (up to three) link ids spread over separate
# columns next to id and name.
dat.wider <- data.frame(
  dat[, c("id", "name")],
  str_split_fixed(
    # Arguments are spelled out in full: the earlier `replace =` only worked
    # through partial matching of `replacement`. Inside the character class
    # the "|" is a literal pipe, so pipes are stripped along with the quotes.
    string = gsub(pattern = "['|\"]", replacement = "", x = dat$links),
    pattern = ", ",
    n = 3
  )
)
# Reshape
dat.long <- melt(dat.wider, id.var = c("id", "name"))
# Self-join - this is not quite the right method, but I'm just not
# thinking straight right now
dat.joined <- unique(merge(x = dat.long[ , c("name", "value")],
y = dat.long[ , c("id", "name")],
by.x = "value",
by.y = "id"
))
# And, finally, if you wanted vector output...
res <- with(dat.joined, paste(name.x, name.y, sep = ", "))

Resources