split the superhero names with their ages in the given vector - r

Given this vector:
vector <- c("Superman1000", "Batman35", "Wonderwoman240")
I want to split the superhero's name and age.
df=data.frame(vector= c("Superman1000", "Batman35", "Wonderwoman240"))
library(stringr)
library(stringi)
library(dplyr)
df %>% separate(vector, c("A", "B"))
I tried this but it doesn't work.

If the data is the same as shown, we can remove all the digits to get the superhero name and remove all the non-digits to get their age.
# Strip the digits to keep the name; strip the non-digits to keep the age.
df$super_hero <- gsub("\\d+", "", df$vector)
df$super_hero_age <- gsub("\\D", "", df$vector)
Or with tidyr::extract
tidyr::extract(df, vector, into = c("name", "age"),regex = "(.*\\D)(\\d+)")
# name age
#1 Superman 1000
#2 Batman 35
#3 Wonderwoman 240
As mentioned by @A5C1D2H2I1M1N2O1R2T1, we can also use strcapture
strcapture("(.*\\D)(\\d+)", df$vector,
proto = data.frame(superhero = character(), age = integer()))

We can use read.csv from base R after creating a delimiter before the numeric part with sub
read.csv(text = sub("(\\d+)", ",\\1", df$vector), header = FALSE,
stringsAsFactors = FALSE, col.names = c('name', 'age'))
# name age
#1 Superman 1000
#2 Batman 35
#3 Wonderwoman 240
Or another option is separate where we specify a regex lookaround
library(tidyr)
separate(df, vector, into = c("name", "age"), sep= "(?<=[a-z])(?=\\d)")
# name age
#1 Superman 1000
#2 Batman 35
#3 Wonderwoman 240

Related

how to convert a price character vector to a numeric vector

I state that I am a neophyte.
I have a single column (character) dataframe on which I would like to find the minimum, maximum
and average price. The min () and max () functions also work with a character vector, but the mean
() or median () functions need a numeric vector. I have tried to change the comma with the period
but the problem becomes more complex when I have the prices in the thousands. How can I do?
>price
Price
1 1.651
2 2.229,00
3 1.899,00
4 2.160,50
5 1.709,00
6 1.723,86
7 1.770,99
8 1.774,90
9 1.949,00
10 1.764,12
This is the dataframe. I thank anyone who wants to help me in advance
Remove the . thousands separators, then replace the , decimal mark with ., and turn the values to numeric.
In base R using gsub -
df <- transform(df, Price = as.numeric(gsub(',', '.',
gsub('.', '', Price, fixed = TRUE), fixed = TRUE)))
# Price
#1 1651.00
#2 2229.00
#3 1899.00
#4 2160.50
#5 1709.00
#6 1723.86
#7 1770.99
#8 1774.90
#9 1949.00
#10 1764.12
You can also use the parse_number function from readr.
library(readr)
df$Price <- parse_number(df$Price,
locale = locale(grouping_mark = ".", decimal_mark = ','))
data
It is easier to help if you provide data in a reproducible format
df <- structure(list(Price = c("1.651", "2.229,00", "1.899,00", "2.160,50",
"1.709,00", "1.723,86", "1.770,99", "1.774,90", "1.949,00", "1.764,12"
)), class = "data.frame", row.names = c(NA, -10L))
url <- "https://www.shoppydoo.it/prezzi-notebook-mwp72t$2fa.html?src=user_search"
page <- read_html(url)
price <- page %>% html_nodes(".price") %>% html_text() %>% data.frame()
colnames(price) <- "Price"
price$Price <- gsub("da ", "", price$Price)
price$Price <-gsub("€", "", price$Price)
# "." is a regex wildcard, so without fixed = TRUE gsub() would delete every
# character; fixed = TRUE removes only the literal dots (thousands separators).
price$Price <- gsub(".", "", price$Price, fixed = TRUE)
We could use chartr in base R
# Swap "." and "," with chartr(), then drop every "," (now the thousands
# separator) so that values of a million or more also parse correctly.
df$Price <- as.numeric(
  gsub(",", "", chartr(".,", ",.", df$Price), fixed = TRUE)
)
data
df <- structure(list(Price = c("1.651", "2.229,00", "1.899,00", "2.160,50",
"1.709,00", "1.723,86", "1.770,99", "1.774,90", "1.949,00", "1.764,12"
)), class = "data.frame", row.names = c(NA, -10L))

exclude part of a character string in a column in a dataframe

I have a df that has several ids separated with an underscore, like so:
df1:
id v1
1001 2
10002_10002 19
I want the underscore removed and anything after the underscore, like so:
df1:
id v1
1001 2
10002 19
I tried this code, but it's giving me a list, not a df. Can someone please help?
df2 <- strsplit(df1$id, split='_', fixed=TRUE)
You need to access the contents of the list, and then retain the first of two elements resulting from the split:
# strsplit() returns one character vector per row; keep the first piece of
# each one so every row retains its own id (indexing with [[1]][1] would
# recycle the first row's id to all rows).
df1$id <- vapply(strsplit(df1$id, "_", fixed = TRUE), `[`, character(1), 1L)
You could also use sub here:
df1$id <- sub("_.*$", "", df1$id)
Here a solution with the tidyverse/stringr:
library(tidyverse)
my_df <- data.frame(
stringsAsFactors = FALSE,
id = c("1001", "10002_10002"),
v1 = c(2L, 19L)
)
my_df %>%
mutate(id=str_remove(id, regex("(_.*)")))
#> id v1
#> 1 1001 2
#> 2 10002 19
Created on 2020-12-03 by the reprex package (v0.3.0)
I like to use the stringr package
library(stringr)
df1 <- data.frame(id = c("1001", "10002_10002"), v1 = c(2, 19))
# Drop the underscore and everything after it (".*" also covers a trailing "_").
df1$id <- str_remove(df1$id, pattern = "_.*")
We can use word from stringr
library(stringr)
library(dplyr)
df1 %>%
mutate(id = word(id, 1, sep="_"))
# id v1
#1 1001 2
#2 10002 19
Another option is trimws from base R
df1$id <- trimws(df1$id, whitespace = "_.*")
data
df1 <- structure(list(id = c("1001", "10002_10002"),
v1 = c(2L, 19L)), class = "data.frame", row.names = c(NA,
-2L))

Unite column names

I am trying to unite all columns of the dataframe df separating them with |.
However, for the name of the new column I would like to have all column names merged together separated the same way (eg S_n|S_s|S_b).
Here is what I tried and received error message Error: Must supply a symbol or a string as argument
S_n = c(2, 3, 5)
S_s = c("aa", "bb", "cc")
S_b = c(TRUE, FALSE, TRUE)
df = data.frame(S_n, S_s, S_b)
unite(df, S_n|S_s|S_b, sep="|", remove=TRUE)
unite(df, "S_n|S_s|S_b", sep="|", remove=TRUE). You need quotes around the column name because it's a non-standard name. (Standard names can't contain symbols other than . and _).
One idea via base R can be,
df[paste(names(df), collapse = '|')] <- do.call(paste, c(df, sep = '|'))
# S_n S_s S_b S_n|S_s|S_b
#1 2 aa TRUE 2|aa|TRUE
#2 3 bb FALSE 3|bb|FALSE
#3 5 cc TRUE 5|cc|TRUE

How to separate a column into three columns with R?

I have a data.frame where one of the columns has the following structure:
"2019-09-11 13:29:55:647 INFO".
How could I separate this column into three columns, where:
column 1 is :"2019-09-11 13:29:55"
column 2 is: "647"
column 3 is "INFO".
I want to use tidyr separate function but can't write a regular expression for separators.
We can use read.csv after inserting a delimiter
# Insert commas at the last ":" and at the space after the milliseconds so
# that read.csv() yields timestamp / milliseconds / level.
cbind(df1, read.csv(
  text = sub("^(.*):(\\d+)\\s+(\\S+)$", "\\1,\\2,\\3", df1$datetime),
  col.names = c("newcol1", "newcol2", "newcol3"),
  header = FALSE, stringsAsFactors = FALSE
))
If we are using tidyverse, specify the sep with a regex lookaround, i.e. to match the last : (a : followed only by characters that are not a : till the end) or the last space (a space followed only by non-space characters till the end)
library(tidyr)
separate(df1, datetime, into = c("newcol1", "newcol2", "newcol3"),
  sep = ":(?=[^:]+$)|\\s(?=\\S+$)")
# newcol1 newcol2 newcol3
#1 2019-09-11 13:29:55 647 INFO
Or with extract, capture the characters as a group till the last : followed by digits till the end of the string
# Groups: everything up to the last ":", the millisecond digits, then the level.
extract(df1, datetime, into = c("newcol1", "newcol2", "newcol3"),
  regex = "^(.*):(\\d+)\\s+(\\S+)$")
# newcol1 newcol2 newcol3
#1 2019-09-11 13:29:55 647 INFO
data
df1 <- data.frame(datetime = "2019-09-11 13:29:55:647 INFO",
stringsAsFactors = FALSE)

String Splitting a column into multiple columns

Hi I have question regarding R Programming, I am a newbie in R.
I have a dataset in excel with a particular column having values as such.
123456
123456789
123456789123
Now my requirement is to get values in multiples of 3 and split into different columns.
For eg. My first row would be splitting into 2 columns and second row into 3 columns
colA colB colC
123 456
123 456 789
The desired output:
Here are a few solutions. The first 5 do not use any packages. nc (number of columns) and cn (column names) defined in (1) are used in the others as well.
1) read.fwf Using the input DF shown reproducibly in the Note at the end count the maximum number of characters in a line and divide by 3 to get the number of columns nc. Next compute the column names cn. Finally use read.fwf to read them in. No packages are used.
nc <- max(nchar(DF[[1]]))/3
cn <- paste0("col", head(LETTERS, nc))
read.fwf(textConnection(as.character(DF[[1]])), rep(3, length = nc),
col.names = cn)
giving:
colA colB colC colD
1 123 456 NA NA
2 123 456 789 NA
3 123 456 789 123
2) formatC A variation on the above would be to use formatC to insert commas after every 3 characters giving the character vector ch and then read that in using read.csv.
ch <- formatC(DF[[1]], format= "f", digits = 0, big.mark = ",")
read.csv(text = ch, header = FALSE, col.names = cn)
3) strsplit Another variation would be to split the column using strsplit and the indicated regular expression to split by and then use toString to put the split components into a comma separated string vector, ch. Finally use read.csv as before.
ch <- sapply(strsplit(as.character(DF[[1]]), "(?<=...)", perl = TRUE), toString)
read.csv(text = ch, header = FALSE, col.names = cn)
4) gsub Yet another variation is to use gsub to insert commas every 3 characters and then use read.csv as in (2) and (3).
ch <- gsub("(...)(?=.)", "\\1,", DF[[1]], perl = TRUE)
read.csv(text = ch, header = FALSE, col.names = cn)
5) strcapture This one does not use any read.* routine. It also uses only base R.
strcapture(strrep("(...)?", nc), DF[[1]], setNames(double(nc), cn))
6) strapplyc This is the only variation that uses a package. strapplyc can be used to pick off successive 3 character subsets. It uses a simpler regular expression than some of our other solutions. read.csv is used as in some of the other solutions.
library(gsubfn)
ch <- sapply(strapplyc(DF[[1]], "..."), toString)
read.csv(text = ch, header = FALSE, col.names = cn)
Note
The input in reproducible form:
Lines <- "
123456
123456789
123456789123"
DF <- read.table(text = Lines)
Here is one option with separate
library(tidyverse)
df %>%
separate(a, into = c('b', 'c', 'd'), sep= c(3, 6), remove = FALSE)
# a b c d
#1 123 123
#2 123456 123 456
#3 123456789 123 456 789
Using convert=TRUE, changes the type of the column automatically
df %>%
separate(a, into = c('b', 'c', 'd'), sep= c(3, 6),
remove = FALSE, convert = TRUE)
data
df <- data.frame (a = c(123,123456,123456789))
using library data.table
library(data.table)
setDT(df1)
df1[, tstrsplit(df1$col1, "(?:.{3}+\\K)", perl = TRUE)] # change {3} to other numbers if you don't want to split after every 3.
# V1 V2 V3 V4
#1: 123 456 <NA> <NA>
#2: 123 456 789 <NA>
#3: 123 456 789 123
data:
df1<-
structure(list(col1 = c("123456", "123456789", "123456789123"
)), class = c("data.table", "data.frame"), row.names = c(NA, -3L))
There's probably a method that involves less repetition but one option may be
library(tidyverse)
df <- data.frame (a = c(123,123456,123456789))
df %>%
mutate(b = substr(a, 0,3),
c = substr(a, 4,6),
d = substr(a, 7,9))
a b c d
1 123 123
2 123456 123 456
3 123456789 123 456 789

Resources