The dataset that I work with contains some numbers inside the strings (usually up to 12), and I need to move all of those numbers to the end of each string:
# A tibble: 2 x 1
a
<chr>
1 THIS IS 1 AN EXAMPLE
2 THIS 2 IS AN EXAMPLE
I tried doing something like this with gsub, but it doesn't work the way I want:
df <- df %>%
dplyr::mutate_at(.vars=vars(a), list(~ gsub(" (\\d) ", "\\2 \\1", .)))
Gives me this:
A tibble: 2 x 1
a
<chr>
1 THIS IS 1AN EXAMPLE
2 THIS 2IS AN EXAMPLE
What I want is: THIS IS AN EXAMPLE 1, THIS IS AN EXAMPLE 2.
How can I do this? Any help is appreciated!!
You can use gregexpr and regmatches.
s <- c("THIS IS 1 AN EXAMPLE", "THIS 2 IS AN EXAMPLE", "THIS 2 IS AN 3 EXAMPLE")
x <- gregexpr(" *\\d+", s)
y <- regmatches(s, x)
regmatches(s, x) <- ""
paste0(s, sapply(y, paste0, collapse = ""))
#[1] "THIS IS AN EXAMPLE 1" "THIS IS AN EXAMPLE 2" "THIS IS AN EXAMPLE 2 3"
With parse_number
library(readr)
library(dplyr)
df <- tibble(a = c("THIS IS 1 AN EXAMPLE", "THIS 2 IS AN EXAMPLE"))
df %>%
mutate(a = paste(sub("\\d+ ", "", a), parse_number(a)))
# A tibble: 2 × 1
a
<chr>
1 THIS IS AN EXAMPLE 1
2 THIS IS AN EXAMPLE 2
If you have more numbers using stringr
library(dplyr)
library(stringr)
df <- tibble(a = c("THIS IS 1 AN EXAMPLE", "THIS 2 IS AN EXAMPLE",
"THIS 223 IS AN 3 EXAMPLE"))
df %>%
mutate(a = paste(gsub("\\d+ ", "", a), sapply(a, function(x)
paste(str_extract_all(x, "\\d+")[[1]], collapse=" "))))
# A tibble: 3 × 1
a
<chr>
1 THIS IS AN EXAMPLE 1
2 THIS IS AN EXAMPLE 2
3 THIS IS AN EXAMPLE 223 3
Should be quite simple if you aim to detect all parts in gsub pattern using three separate brackets for pre-match, match and post-match parts:
library(tidyverse)
tibble(a = c("THIS IS 1 AN EXAMPLE", "THIS 2 IS AN EXAMPLE")) |>
mutate(a = gsub("(.*)( \\d )(.*)", "\\1 \\3\\2", a))
#> # A tibble: 2 × 1
#> a
#> <chr>
#> 1 "THIS IS AN EXAMPLE 1 "
#> 2 "THIS IS AN EXAMPLE 2 "
Using str_remove and str_extract is another option (easy to read/grasp):
library(stringr)
library(dplyr)
df |>
mutate(a = paste(str_remove(a, "\\d+ "), str_extract(a, "\\d+")))
Output:
# A tibble: 2 × 1
a
<chr>
1 THIS IS AN EXAMPLE 1
2 THIS IS AN EXAMPLE 2
Data:
df <-
tibble(a = c("THIS IS 1 AN EXAMPLE",
"THIS 2 IS AN EXAMPLE"))
Related
this is my first post here :)
So I encountered some weird behavior today: When using the dplyr mutate function together with the paste function, the outcome is the same for every row.
Here is an example:
vec1 <- c(2, 5)
vec2 <- c(4, 6)
test_df <- data.frame(vec1, vec2)
test_df %>% mutate(new_col = paste(vec1:vec2, collapse = ","))
with the output
vec1 vec2 new_col
1 2 4 2,3,4
2 5 6 2,3,4
but thats not what I wanted or expected.
Here is what I wanted, achieved with a loop:
# Build new_col row by row: for each row, paste the integer sequence
# vec1[i]:vec2[i] into one comma-separated string.
# The placeholder column is created as character so its type matches the
# strings written into it.
df <- test_df %>% mutate(new_col = NA_character_)
# seq_len() yields an empty sequence for a zero-row data frame, whereas
# 1:nrow(df) would iterate over c(1, 0).
for (i in seq_len(nrow(df))) {
  df$new_col[i] <- paste(df$vec1[i]:df$vec2[i], collapse = ",")
}
With the output:
vec1 vec2 new_col
1 2 4 2,3,4
2 5 6 5,6
Whats going on and how can I achieve the same with mutate and paste?
We can build each sequence by looping over the corresponding elements of vec1 and vec2 with map2, and then paste (str_c) the sequence values into a single string
library(dplyr)
library(purrr)
library(stringr)
test_df %>%
mutate(new_col = map2_chr(vec1, vec2, ~ str_c(.x:.y, collapse = ",")))
-output
vec1 vec2 new_col
1 2 4 2,3,4
2 5 6 5,6
Or with rowwise
test_df %>%
rowwise %>%
mutate(new_col = str_c(vec1:vec2, collapse = ",")) %>%
ungroup
# A tibble: 2 × 3
vec1 vec2 new_col
<dbl> <dbl> <chr>
1 2 4 2,3,4
2 5 6 5,6
I want to see whether the text column has elements outside the specified values of "a" and "b"
# Values that are allowed to appear in the comma-separated `text` column.
specified_value <- c("a", "b")
# Input: one row per key, `text` holds comma-separated values.
df <- data.frame(key = c(1, 2, 3, 4), text = c("a,b,c", "a,d", "1,2", "a,b"))
# Desired output: the values of `text` that are not in `specified_value`
# (NA when every value is allowed). `key` needs four entries to match `text`.
df_out <- data.frame(key = c(1, 2, 3, 4), text = c("c", "d", "1,2", NA))
This is what I have tried:
df=df%>%mutate(text_vector=strsplit(text, split=","),
extra=text_vector[which(!text_vector %in% specified_value)])
But this doesn't work, any suggestions?
We can split the 'text' by the delimiter , with separate_rows, grouped by 'key', get the elements that are not in 'specified_value' with setdiff and paste them together (toString), then do a join to get the other columns in the original dataset
library(dplyr) # >= 1.0.0
library(tidyr)
df %>%
separate_rows(text) %>%
group_by(key) %>%
summarise(extra = toString(setdiff(text, specified_value))) %>%
left_join(df) %>%
mutate(extra = na_if(extra, ""))
# A tibble: 4 x 3
# key extra text
# <dbl> <chr> <chr>
#1 1 c a,b,c
#2 2 d a,d
#3 3 1, 2 1,2
#4 4 <NA> a,b
Using setdiff.
# For each row, keep the comma-separated values of `text` that are not in
# `specified_value`. Rows with nothing left over get a real missing value:
# pasting an NA would instead yield the literal string "NA", which prints
# like a missing value in a data.frame but does not satisfy is.na().
df$outside <- vapply(
  strsplit(df$text, ",", fixed = TRUE),
  function(parts) {
    extra <- setdiff(parts, specified_value)
    if (length(extra) == 0L) {
      NA_character_
    } else {
      paste(extra, collapse = ",")
    }
  },
  character(1)
)
df
#   key  text outside
# 1   1 a,b,c       c
# 2   2   a,d       d
# 3   3   1,2     1,2
# 4   4   a,b    <NA>
Data:
df <- structure(list(key = c(1, 2, 3, 4), text = c("a,b,c", "a,d",
"1,2", "a,b")), class = "data.frame", row.names = c(NA, -4L))
specified_value <- c("a", "b")
use stringi::stri_split_fixed
library(stringi)
# TRUE when at least one comma-separated value is not in `specified_value`.
# `simplify = TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned.
!all(stri_split_fixed("a,b", ",", simplify = TRUE) %in% specified_value) #FALSE
!all(stri_split_fixed("a,b,c", ",", simplify = TRUE) %in% specified_value) #TRUE
An option using regex without splitting the data on comma :
#Collapse the specified_value in one string and remove from text
df$text1 <- gsub(paste0(specified_value, collapse = "|"), '', df$text)
#Remove extra commas
df$text1 <- gsub('(?<![a-z0-9]),', '', df$text1, perl = TRUE)
df
# key text text1
#1 1 a,b,c c
#2 2 a,d d
#3 3 1,2 1,2
#4 4 a,b
Having a dataframe like this:
data.frame(id = c(1,2,3,4), text1 = c("sth","","another",""), text2 = c("more","another","add",""), text3 = c("final","and","where","all"))
How is it possible to detect whether a row of the text1 column is blank, fill that blank with the first non-empty value found in the text2, text3 (or any later text) column of the same row, and leave the cell the value was taken from blank (or NA) after the process?
Example of expected output
data.frame(id = c(1,2,3,4), text1 = c("sth","another","another","all"), text2 = c("more","","add",""), text3 = c("final","and","where",""))
A vectorized base R approach :
#Get indices where text1 is empty
inds <- which(df$text1 == '')
#get values to replace from the corresponding rows
vals <- cbind(inds, max.col(df[inds, 3:ncol(df)] != "") + 2)
#Replace the values
df$text1[inds] <- df[vals]
#Change the replaced value with blank.
df[vals] <- ''
df
# id text1 text2 text3
#1 1 sth more final
#2 2 another and
#3 3 another add where
#4 4 all
data
df <- data.frame(id = c(1,2,3,4), text1 = c("sth","","another",""),
text2 = c("more","another","add",""),
text3 = c("final","and","where","all"), stringsAsFactors = FALSE)
In base R you could do:
txt <- do.call(paste,c(sep = ',',`is.na<-`(df,df=="")))
df1 <- read.csv(text = sub("((?:,NA)+)(,\\w+)","\\2\\1",txt),
header = FALSE,
col.names = names(df),
stringsAsFactors = FALSE)
df1[is.na(df1)] <- ""
df1
id text1 text2 text3
1 1 sth more final
2 2 another and
3 3 another add where
4 4 all
here is a data.table approach...
explanation in comments below
#sample data
df <- data.frame(id = c(1,2,3,4), text1 = c("sth","","another",""), text2 = c("more","another","add",""), text3 = c("final","and","where","all"), stringsAsFactors = FALSE)
library( data.table )
#create data.table
setDT( df )
#paste together columns by id
ans <- df[, .(string = paste0( .SD, collapse =";")), by = .(id) ][]
# id string
# 1: 1 sth;more;final
# 2: 2 ;another;and
# 3: 3 another;add;where
# 4: 4 ;;all
#remove leading;'s
ans[, string := gsub("^;+", "", string) ]
# id string
# 1: 1 sth;more;final
# 2: 2 another;and
# 3: 3 another;add;where
# 4: 4 all
#split string back to columns, remove the temporary string-column
ans[, paste0( "text", 1:length( tstrsplit(ans$string, ";") ) ) :=
tstrsplit( string, ";") ][, string := NULL ]
# id text1 text2 text3
# 1: 1 sth more final
# 2: 2 another and <NA>
# 3: 3 another add where
# 4: 4 all <NA> <NA>
You can use dplyr + purrr:
df %>%
tidyr::nest(-id) %>%
dplyr::mutate(
new_text = purrr::map_chr(
data, ~
as.vector(t(.x[1,])) %>%
.[. != ""] %>%
dplyr::first())) %>%
tidyr::unnest()
A tibble: 4 x 5
id text1 text2 text3 new_text
<dbl> <fct> <fct> <fct> <chr>
1 1 sth more final sth
2 2 "" another and another
3 3 another add where another
4 4 "" "" all all
At this stage, why not also a dplyr approach? Admittedly, with a sparkle of base R in the middle
df <- data.frame(id = c(1,2,3,4),
text1 = c("sth","","another",""),
text2 = c("more","another","add",""),
text3 = c("final","and","where","all"))
library("dplyr")
library("tidyr")
df_filled <- df %>%
pivot_longer(cols = starts_with("text"),
names_to = "text_id",
values_to = "value") %>%
mutate(value = as.character(value)) %>%
group_by(id) %>%
mutate(value = if_else(value=="", as.character(NA), value)) %>%
mutate(previously_missing = value) %>%
tidyr::fill(value, .direction = "downup")
df_filled$value[which(is.na(df_filled$previously_missing)&df_filled$text_id!="text3")+1] <- NA
df_filled %>%
ungroup() %>%
pivot_wider(id_cols = id,
names_from = "text_id",
values_from = "value")
#> # A tibble: 4 x 4
#> id text1 text2 text3
#> <dbl> <chr> <chr> <chr>
#> 1 1 sth more final
#> 2 2 another <NA> and
#> 3 3 another add where
#> 4 4 all <NA> <NA>
Created on 2020-02-19 by the reprex package (v0.3.0)
Another base R solution is to define your custom function swap and apply it by rows, i.e.,
# Exchange the first element of `v` with the first element that has a
# positive character count. If no element qualifies, or the first element
# already does, `v` comes back unchanged.
swap <- function(v) {
  first_filled <- head(which(nchar(v) > 0), 1)
  # Position 1 paired with the first filled position; when nothing is filled
  # this is just position 1, so the assignment below is a no-op.
  positions <- c(1, first_filled)
  v[positions] <- v[rev(positions)]
  v
}
df[-1]<-t(apply(df[-1], 1, swap))
such that
> df
id text1 text2 text3
1 1 sth more final
2 2 another and
3 3 another add where
4 4 all
This question already has answers here:
Repeat each row of data.frame the number of times specified in a column
(10 answers)
Collapse / concatenate / aggregate a column to a single comma separated string within each group
(6 answers)
Closed 2 years ago.
I have a dataframe and I want to replicate the input of a single cell n times dependent on the input of the next cell and display it in a new cell.
My dataframe looks like this:
data <- data.frame(c(1,1,2,3,4,4,4), c("A","B","A","C","D","E","A"), c(2,1,1,3,2,1,3))
colnames(data) <- c("document number", "term", "count")
data
This is my desired result:
datanew <- data.frame(c(1,2,3,4), c("A A B", "A", "C C C", "D D E A A A"))
colnames(datanew) <- c("document number", "term")
# document number term
# 1 1 A A B
# 2 2 A
# 3 3 C C C
# 4 4 D D E A A A
So basically, I would like to repeat the value of each term cell as many times as given by the corresponding count cell, and then combine the results per document number. Does anyone have an idea how to code this in R?
We can use rep to replicate term count times and paste the data together.
library(dplyr)
data %>%
group_by(`document number`) %>%
summarise(new = paste(rep(term, count), collapse = " "))
# A tibble: 4 x 2
# `document number` new
# <dbl> <chr>
#1 1 A A B
#2 2 A
#3 3 C C C
#4 4 D D E A A A
Similarly with data.table
library(data.table)
# Per document number, repeat each term `count` times and collapse the result
# into one space-separated string. `.()` (data.table's alias for list()) is
# needed for the result column to be named `new`; a bare `(new = ...)` is
# just a parenthesised assignment and the column would be called V1.
setDT(data)[, .(new = paste(rep(term, count), collapse = " ")),
by = `document number`]
We can do this with tidyverse methods
library(dplyr)
library(tidyr)
library(stringr)
data %>%
uncount(count) %>%
group_by(`document number`) %>%
summarise(term = str_c(term, collapse=' '))
# A tibble: 4 x 2
# `document number` term
# <dbl> <chr>
#1 1 A A B
#2 2 A
#3 3 C C C
#4 4 D D E A A A
Or with data.table
library(data.table)
setDT(data)[rep(seq_len(.N), count)][, .(term =
paste(term, collapse=' ')), `document number`]
Or using base R with aggregate
aggregate(term ~ `document number`, data[rep(seq_len(nrow(data)),
data$count),], FUN = paste, collapse= ' ')
I need to define a function f(x,y) such that:
x = "col1,1,2,3,4"
y = "col2,a,b,c,d"
becomes:
# A tibble: 4 x 2
col1 col2
<int> <chr>
1 1 a
2 2 b
3 3 c
4 4 d
Any thoughts? Thanks.
The most obvious idea that comes to mind is to split the input by comma, use paste to combine the output into a single string, and read that using read_csv.
Example:
paste(do.call(paste, c(strsplit(c(x, y), ","), sep = ", ")), collapse = "\n")
# [1] "col1, col2\n1, a\n2, b\n3, c\n4, d"
library(tidyverse)
read_csv(paste(do.call(paste, c(strsplit(c(x, y), ","), sep = ", ")), collapse = "\n"))
# # A tibble: 4 x 2
# col1 col2
# <int> <chr>
# 1 1 a
# 2 2 b
# 3 3 c
# 4 4 d
From there, I hope you're able to convert the approach to a function.