My dataframe looks like this:
x <-
data.frame(
id = c("123_1", "987_123")
)
I'd like to create the result data frame below using dplyr's mutate() function: I just want the part before the underscore in one column and the part after it in another.
result <-
data.frame(
id = c("123_1", "987_123"),
af = c("123", "987"),
ad = c("1", "123")
)
1) tidyverse Use separate like this (its default sep splits on any non-alphanumeric character, so the underscore is handled without specifying sep):
library(dplyr)
library(tidyr)
x %>%
separate(id, c("af", "ad"), remove = FALSE)
## id af ad
## 1 123_1 123 1
## 2 987_123 987 123
2) Base R
2a) read.table Without any packages, use read.table:
cbind(x, read.table(text = x$id, sep = "_", col.names = c("af", "ad"),
colClasses = "character"))
## id af ad
## 1 123_1 123 1
## 2 987_123 987 123
2b) sub Or use sub:
transform(x, af = sub("_.*", "", id), ad = sub(".*_", "", id))
## id af ad
## 1 123_1 123 1
## 2 987_123 987 123
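Since the question asks for mutate specifically, here is a minimal mutate-based sketch using the same sub calls as in 2b):
library(dplyr)
x %>%
  mutate(af = sub("_.*", "", id),  # part before the first underscore
         ad = sub(".*_", "", id))  # part after the last underscore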
Related
I have a df that has several ids separated with an underscore, like so:
df1:
id v1
1001 2
10002_10002 19
I want the underscore and anything after it removed, like so:
df1:
id v1
1001 2
10002 19
I tried this code, but it's giving me a list, not a df. Can someone please help?
df2 <- strsplit(df1$id, split='_', fixed=TRUE)
You need to access the contents of the list and keep the first of the two elements resulting from each split:
df1$id <- sapply(strsplit(df1$id, "_", fixed = TRUE), "[", 1)
You could also use sub here:
df1$id <- sub("_.*$", "", df1$id)
Here is a solution with the tidyverse/stringr:
library(tidyverse)
my_df <- data.frame(
stringsAsFactors = FALSE,
id = c("1001", "10002_10002"),
v1 = c(2L, 19L)
)
my_df %>%
mutate(id = str_remove(id, regex("(_.*)")))
#> id v1
#> 1 1001 2
#> 2 10002 19
Created on 2020-12-03 by the reprex package (v0.3.0)
I like to use the stringr package
require(stringr)
df1 <- data.frame(id = c("1001","1002_10002"), v1 = c(2,19))
df1$id <- str_remove(df1$id, pattern = "_.+")
We can use word from stringr
library(stringr)
library(dplyr)
df1 %>%
mutate(id = word(id, 1, sep="_"))
# id v1
#1 1001 2
#2 10002 19
Another option is trimws from base R (its whitespace argument requires R >= 3.6.0):
df1$id <- trimws(df1$id, whitespace = "_.*")
data
df1 <- structure(list(id = c("1001", "10002_10002"),
v1 = c(2L, 19L)), class = "data.frame", row.names = c(NA,
-2L))
Given this vector:
vector <- c("Superman1000", "Batman35", "Wonderwoman240")
I want to split the superhero's name and age.
df=data.frame(vector= c("Superman1000", "Batman35", "Wonderwoman240"))
library(stringr)
library(stringi)
library(dplyr)
df %>% separate(vector, c("A", "B"))
I tried this but it doesn't work.
If the data is the same as shown, we can remove all the digits to get the superhero name and remove all the non-digits to get the age.
df$super_hero <- gsub("\\d+", "", df$vector)
df$super_hero_age <- gsub("\\D", "", df$vector)
Or with tidyr::extract
tidyr::extract(df, vector, into = c("name", "age"),regex = "(.*\\D)(\\d+)")
# name age
#1 Superman 1000
#2 Batman 35
#3 Wonderwoman 240
As mentioned by #A5C1D2H2I1M1N2O1R2T1, we can also use strcapture
strcapture("(.*\\D)(\\d+)", df$vector,
proto = data.frame(superhero = character(), age = integer()))
We can use read.csv from base R after creating a delimiter before the numeric part with sub
read.csv(text = sub("(\\d+)", ",\\1", df$vector), header = FALSE,
stringsAsFactors = FALSE, col.names = c('name', 'age'))
# name age
#1 Superman 1000
#2 Batman 35
#3 Wonderwoman 240
Or another option is separate where we specify a regex lookaround
library(tidyr)
separate(df, vector, into = c("name", "age"), sep= "(?<=[a-z])(?=\\d)")
# name age
#1 Superman 1000
#2 Batman 35
#3 Wonderwoman 240
I'm trying to find a way to look up multiple values in a dataframe and return a value. Simplified example:
df1 <- read.table(text="chk1 chk2 chk3 value
xx aa;bb;cc jj 1
xx;yy dd;ee;ff kk 2
zz gg;hh;ii ll;nn 3", header=T)
df2 <- read.table(text="val1 val2 val3
xx bb jj
xx dd kk
yy ee kk
zz hh jj
", header=T)
Look up values val1, val2, and val3 from df2 in df1 and return value from df1.
Desired results:
df2 <- read.table(text="
val1 val2 val3 value
xx bb jj 1
xx dd kk 2
yy ee kk 2
zz hh jj NA
")
I tried match, %in%, and looping over the rows, but I can't get it to work.
Here is one possibility:
library(tidyverse)
df3 <- df2 %>% rowwise %>%
mutate(rowmatch=which(grepl(val1, df1$chk1) &
grepl(val2, df1$chk2) &
grepl(val3, df1$chk3))[1],
value=df1$value[rowmatch])
Result:
# A tibble: 4 x 5
val1 val2 val3 rowmatch value
<chr> <chr> <chr> <int> <int>
1 xx bb jj 1 1
2 xx dd kk 2 2
3 yy ee kk 2 2
4 zz hh jj NA NA
Notes:
the [1] ensures that only the first of the matching rows is used.
note that although rowmatch and value are identical in this example, this is only because df1$value equals the row number.
a tibble behaves like a data.frame, but if you really prefer a plain data frame, add %>% as.data.frame()
The same can be done with base R and apply:
df2$rowmatch <- with(df1, apply(df2, 1, function(x)
which(grepl(x["val1"], chk1) &
grepl(x["val2"], chk2) &
grepl(x["val3"], chk3))[1]))
df2$value <- df1$value[df2$rowmatch]
Another option would be splitting the values first:
df1 <- df1 %>%
splitstackshape::cSplit("chk1", ";", fixed = TRUE, direction = "long", drop = FALSE, type.convert = FALSE) %>%
splitstackshape::cSplit("chk2", ";", fixed = TRUE, direction = "long", drop = FALSE, type.convert = FALSE) %>%
splitstackshape::cSplit("chk3", ";", fixed = TRUE, direction = "long", drop = FALSE, type.convert = FALSE)
and then joining df2 to the expanded df1, as sketched below.
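A minimal sketch of that join step (assuming the cSplit calls leave a single value in each chk column per row, and that the key columns are character):
library(dplyr)
df2 %>%
  left_join(df1, by = c("val1" = "chk1", "val2" = "chk2", "val3" = "chk3")) %>%
  select(val1, val2, val3, value)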
You can also do it using two nested for loops. The logic is to take the first row of df2 and then go through the rows of df1 to see if df2$val1 matches df1$chk1, df2$val2 matches df1$chk2, and df2$val3 matches df1$chk3. I consider the columns a match if there is at least one match per column. The caveat here is that if df2 does not have unique rows, the last matching row from df1 will be written to df2. This can be changed by breaking out of the loop as soon as a match is found (see the sketch after the output below).
for (i in 1:nrow(df2)) {
for (j in 1:nrow(df1)) {
# Take the j-th row of df1 (without the value column) and split each chk
# column on ";". The result is a list of string vectors we'll match against.
i.split <- strsplit(as.character(unlist(df1[j, , drop = TRUE][-4])), ";")
# Pairwise check columns from df1 and df2.
all.ok <- all(mapply(FUN = function(x, y) {
any(x %in% y)
}, x = i.split, y = as.list(df2[i, 1:3])
))
if (all.ok) {
# If a match is found, write the value to df2.
df2[i, "value"] <- df1[j, "value"]
}
}
}
Output:
val1 val2 val3 value
1 xx bb jj 1
2 xx dd kk 2
3 yy ee kk 2
4 zz hh jj NA
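As noted above, to keep only the first match, a break can be added right after the assignment. The if block inside the inner loop would then read (just a sketch of that one change):
if (all.ok) {
# If a match is found, write the value to df2 and stop scanning df1.
df2[i, "value"] <- df1[j, "value"]
break
}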
Hi, I have a question regarding R programming; I am a newbie in R.
I have a dataset in Excel with a particular column containing values such as:
123456
123456789
123456789123
Now my requirement is to split the values into groups of 3 digits, with each group going into its own column. For example, my first row would be split into 2 columns and my second row into 3 columns. The desired output:
colA colB colC
123 456
123 456 789
Here are a few solutions. The first 5 do not use any packages. nc (number of columns) and cn (column names) defined in (1) are used in the others as well.
1) read.fwf Using the input DF shown reproducibly in the Note at the end, count the maximum number of characters in a line and divide by 3 to get the number of columns, nc. Next compute the column names, cn. Finally use read.fwf to read them in. No packages are used.
nc <- max(nchar(DF[[1]]))/3
cn <- paste0("col", head(LETTERS, nc))
read.fwf(textConnection(as.character(DF[[1]])), rep(3, length = nc),
col.names = cn)
giving:
colA colB colC colD
1 123 456 NA NA
2 123 456 789 NA
3 123 456 789 123
2) formatC A variation on the above would be to use formatC to insert commas after every 3 characters, giving the character vector ch, and then read that in using read.csv.
ch <- formatC(DF[[1]], format= "f", digits = 0, big.mark = ",")
read.csv(text = ch, header = FALSE, col.names = cn)
3) strsplit Another variation would be to split the column using strsplit with the indicated regular expression, then use toString to put the split components into a comma-separated string vector, ch. Finally use read.csv as before.
ch <- sapply(strsplit(as.character(DF[[1]]), "(?<=...)", perl = TRUE), toString)
read.csv(text = ch, header = FALSE, col.names = cn)
4) gsub Yet another variation is to use gsub to insert commas every 3 characters and then use read.csv as in (2) and (3).
ch <- gsub("(...)(?=.)", "\\1,", DF[[1]], perl = TRUE)
read.csv(text = ch, header = FALSE, col.names = cn)
5) strcapture This one does not use any read.* routine. It also uses only base R.
strcapture(strrep("(...)?", nc), DF[[1]], setNames(double(nc), cn))
6) strapplyc This is the only variation that uses a package. strapplyc can be used to pick off successive 3 character subsets. It uses a simpler regular expression than some of our other solutions. read.csv is used as in some of the other solutions.
library(gsubfn)
ch <- sapply(strapplyc(DF[[1]], "..."), toString)
read.csv(text = ch, header = FALSE, col.names = cn)
Note
The input in reproducible form:
Lines <- "
123456
123456789
123456789123"
DF <- read.table(text = Lines)
Here is one option with separate
library(tidyverse)
df %>%
separate(a, into = c('b', 'c', 'd'), sep= c(3, 6), remove = FALSE)
# a b c d
#1 123 123
#2 123456 123 456
#3 123456789 123 456 789
Using convert = TRUE changes the column types automatically:
df %>%
separate(a, into = c('b', 'c', 'd'), sep= c(3, 6),
remove = FALSE, convert = TRUE)
data
df <- data.frame (a = c(123,123456,123456789))
Using the data.table library:
library(data.table)
setDT(df1)
df1[, tstrsplit(df1$col1, "(?:.{3}+\\K)", perl = TRUE)] # change {3} to other numbers if you don't want to split after every 3.
# V1 V2 V3 V4
#1: 123 456 <NA> <NA>
#2: 123 456 789 <NA>
#3: 123 456 789 123
data:
df1<-
structure(list(col1 = c("123456", "123456789", "123456789123"
)), class = c("data.table", "data.frame"), row.names = c(NA, -3L))
There's probably a method that involves less repetition but one option may be
library(tidyverse)
df <- data.frame (a = c(123,123456,123456789))
df %>%
mutate(b = substr(a, 0,3),
c = substr(a, 4,6),
d = substr(a, 7,9))
a b c d
1 123 123
2 123456 123 456
3 123456789 123 456 789
input:
tmp <- "(12,'chinese'),(13,'italian'),(14,'spanish')"
desired output:
id food_type
12 chinese
13 italian
14 spanish
I tried gsub/strsplit but realized that feeding the pieces into new rows and columns is another problem. Thanks.
Here is a solution based on eval(parse(text = )), converting the string into an expression:
x <- eval(parse(text = paste0('list(', gsub('\\(', 'c\\(', tmp), ')')))
res <- as.data.frame(do.call(rbind, x), stringsAsFactors = FALSE)
names(res) <- c('id', 'food_type')
res
# id food_type
# 1 12 chinese
# 2 13 italian
# 3 14 spanish
Using strsplit and sub:
tmp <- "(12,'chinese'),(13,'italian'),(14,'spanish')"
terms <- strsplit(tmp, "(?<=\\)),(?=\\()", perl=TRUE)
df <- lapply(terms[[1]], function(x) {
id <- sub("^\\(([^,]*).*", "\\1", x)
food_type <- sub(".*,'(.*)'\\)", "\\1", x)
z <- c(id, food_type)
return(z)
})
df <- do.call(rbind.data.frame, df)
names(df) <- c("id", "food_type")
df
id food_type
1 12 chinese
2 13 italian
3 14 spanish
Check out this solution; I hope it will help you.
tmp1 <- gsub("\\'", "", gsub("\\(", "", unlist(strsplit(unlist(strsplit(tmp, ",")), "\\)"))))
id <- as.numeric(tmp1[seq(1, length(tmp1), 2)])
fooditem <- tmp1[seq(2, length(tmp1), 2)]
res <- data.frame(id, fooditem)
id fooditem
1 12 chinese
2 13 italian
3 14 spanish
I was looking for a way to transform the input so the read.table function could produce the desired output. The final steps turned out to be:
df <- lapply(strsplit(tmp, "\\(|\\)\\,?", perl = TRUE), function(x){
x <- x[x != ""]
read.table(text = paste0(x), sep = ",", header = FALSE, stringsAsFactors = FALSE)
})
df <- do.call(rbind, df)
names(df) <- c("id", "food_type")
# Result:
#> df
# id food_type
#1 12 chinese
#2 13 italian
#3 14 spanish