I have this dataset in R:
library(stringr)
# Reproducible example data: ten vectors of 100 random digits (1-5), drawn one
# after another from the same seeded random stream.
set.seed(999)
col1 <- sample.int(5, 100, replace = TRUE)
col2 <- sample.int(5, 100, replace = TRUE)
col3 <- sample.int(5, 100, replace = TRUE)
col4 <- sample.int(5, 100, replace = TRUE)
col5 <- sample.int(5, 100, replace = TRUE)
col6 <- sample.int(5, 100, replace = TRUE)
col7 <- sample.int(5, 100, replace = TRUE)
col8 <- sample.int(5, 100, replace = TRUE)
col9 <- sample.int(5, 100, replace = TRUE)
col10 <- sample.int(5, 100, replace = TRUE)

# One row per vector: each vector is collapsed into a single digit string.
d <- data.frame(
  id = 1:10,
  seq = vapply(
    list(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10),
    paste,
    character(1),
    collapse = "",
    USE.NAMES = FALSE
  )
)
For each row, I would like to create new variables:
d$most_common: the most common element in each row
d$second_most_common: the second most common element in each row
d$third_most_common: the third most common element in each row
I tried to do this with the following function (Find the most frequent value by row):
#' Most frequent value in each row of a matrix or data.frame
#'
#' @param x A matrix or a data.frame.
#' @param ties How to break ties between equally frequent values: one of
#'   "random", "first" or "last". `NULL` (the default) means "random".
#' @param include.na If `TRUE`, `NA` is counted like a value whenever one is
#'   present; if `FALSE`, `NA`s are left out of the frequency table.
#' @return A character vector with one element per row of `x`, holding the
#'   label of the most frequent value in that row.
rowMode <- function(x, ties = NULL, include.na = FALSE) {
  # input checks data (scalar `||`: this is a single yes/no decision)
  if (!(is.matrix(x) || is.data.frame(x))) {
    stop("Your data is not a matrix or a data.frame.")
  }
  # set ties method to 'random' if not specified
  if (is.null(ties)) {
    ties <- "random"
  }
  # input checks ties method: it must be exactly one of the allowed strings.
  # The length check keeps zero-length or multi-element input from reaching
  # the scalar condition and failing with an unrelated error message.
  if (length(ties) != 1L || !(ties %in% c("random", "first", "last"))) {
    stop("Your ties method is not one of 'random', 'first' or 'last'.")
  }
  # create row frequency table: one row per row of x, one column per value
  rft <- table(c(row(x)), unlist(x), useNA = c("no", "ifany")[1L + include.na])
  # get the mode for each row: the label of the column with the highest count
  colnames(rft)[max.col(rft, ties.method = ties)]
}
# NOTE: d[1,1] selects a single cell; with the default drop = TRUE a data.frame
# returns that cell as a plain vector, not as a data.frame.
rowMode(d[1,1])
This gave me an error:
Error in rowMode(d[1, 1]) : Your data is not a matrix or a data.frame.
Which is a bit confusing, seeing as "d" is a data.frame.
Is there an easier way to do this?
Thank you!
You can do this by splitting the long string into individual characters, pivoting longer, counting instances by id and character, and taking the top 3.
Here is an approach using data.table
library(data.table)
# Convert d to a data.table by reference.
setDT(d)
# Split each seq into single characters (one column per character, by id),
# melt to long format, count occurrences per (id, value), order by descending
# count and keep the first three rows of each id; nth numbers those rows 1-3.
melt(d[, tstrsplit(seq,""), id], id.vars = "id")[, .N, .(id, value)][order(-N), .SD[1:3][,nth:=.I], id]
Output (first six rows of 30):
id value N nth
1: 2 2 30 1
2: 2 1 22 2
3: 2 4 19 3
4: 3 3 28 1
5: 3 2 23 2
6: 3 4 20 3
Here is a similar approach using dplyr with unnest() to make long:
# Per id: split seq into single characters, count every character and keep
# the three most frequent ones.
d %>%
  group_by(id) %>%
  mutate(chars = strsplit(seq, "")) %>%
  unnest(chars) %>%
  # sort = TRUE puts the largest counts first; TRUE is spelled out because `T`
  # is an ordinary variable that can be reassigned
  count(id, chars, sort = TRUE) %>%
  slice_head(n = 3)
Output:
id chars n
<int> <chr> <int>
1 1 1 24
2 1 5 20
3 1 2 19
4 2 2 30
5 2 1 22
6 2 4 19
7 3 3 28
8 3 2 23
9 3 4 20
10 4 1 26
If you need the variables "Most_common", "second_most":
You can use: mutate & str_count which counts each character in a string
library(dplyr)
library(stringr)

# range: the characters that can occur in seq. Element i of r is the digit i,
# so a position in r is also the digit itself.
r <- as.character(1:5)

# Rank the digits of every row by how often they occur in seq.
# order(..., decreasing = TRUE) returns a complete ranking of the positions in
# r, so the three results are always distinct digits; digits with equal counts
# keep their original order in r.
d |>
  group_by(id) |>
  mutate(
    ranking = list(order(str_count(seq, r), decreasing = TRUE)),
    most_common = ranking[[1]][1],
    second_most_common = ranking[[1]][2],
    third_most_common = ranking[[1]][3]
  ) |>
  # ranking is only a helper column
  select(-ranking)
id seq most_common second_most_com… third_most_comm…
<int> <chr> <int> <int> <int>
1 1 3451122353321532415512241532113224441251251254542314534141431523132515542431525553… 1 5 2
2 2 1432431521432121553144243252433424314222143112242423421524144222151123234314255321… 2 1 4
3 3 4232245131422525453443332555312143535325221555344453323342533222344112134311342335… 3 2 4
4 4 4252525524252335331144111244343534224454131341553141342131354215143133213214314241… 1 3 4
5 5 2223513245222513345115334422121115412343225125312335414233115453235322543311352331… 3 2 1
6 6 3244331444151221411123513334135553324122122233134315145451545423111325253225325141… 1 1 2
7 7 4353332532552141211553131123521145214552211231144155553152131124221522333222343355… 5 1 3
8 8 1432215433134223221222143432454314232514255344213444342235252213324245413213554121… 2 4 3
9 9 2335142431432434123121254343455134511124323335211514354553145531115232541551252421… 1 1 3
10 10 1552245312213342315524134513123511112311314321112334533252141242212345432435421535… 1 3 2
Related
I have
# Records are separated by "|"; the two fields of a record by ",".
x <- "1, A | 2, B | 10, C "
x is always formatted this way: | denotes a new row, the first value is variable1, and the second value is variable2.
I would like to convert it to a data.frame
variable1 variable2
1 1 A
2 2 B
3 10 C
I haven't found any package that can understand the escape character |
How can I convert it to data.frame?
We may use read.table from base R to read the string into two columns after replacing the | with \n
# Turn every "|" into a line break so each record sits on its own line, then
# parse the text as comma-separated fields with surrounding blanks stripped.
read.table(
  text = chartr("|", "\n", x),
  sep = ",",
  header = FALSE,
  strip.white = TRUE,
  col.names = c("variable1", "variable2")
)
-output
variable1 variable2
1 1 A
2 2 B
3 10 C
Or use fread from data.table
library(data.table)
# Same idea with fread(): one record per line after replacing "|" with "\n".
fread(
  chartr("|", "\n", x),
  col.names = c("variable1", "variable2")
)
variable1 variable2
1: 1 A
2: 2 B
3: 10 C
Or using tidyverse - separate_rows to split the column and then create two columns with separate
library(tidyr)
library(dplyr)
# One row per "|"-separated record, then one column per ","-separated field.
tibble(x = trimws(x)) |>
  separate_rows(x, sep = "\\s*\\|\\s*") |>
  separate(
    x,
    into = c("variable1", "variable2"),
    sep = ",\\s+",
    convert = TRUE
  )
# A tibble: 3 × 2
variable1 variable2
<int> <chr>
1 1 A
2 2 B
3 10 C
Here's a way using scan().
x <- "1, A | 2, B | 10, C "
# scan() cuts the string into records at "|" and strips surrounding blanks;
# strsplit() separates the two fields of each record and rbind.data.frame()
# stacks the records as rows. TRUE is spelled out because `T` can be reassigned.
do.call(
  rbind.data.frame,
  strsplit(
    scan(text = x, what = "A", sep = "|", quiet = TRUE, strip.white = TRUE),
    ", "
  )
) |>
  setNames(c("variable1", "variable2"))
# variable1 variable2
# 1 1 A
# 2 2 B
# 3 10 C
Note: R version 4.1.2 (2021-11-01).
I have the below dataframe xo. For each row, I want to find and replace the positions listed in positions_of_Ns_to_remove in sequence. The resulting new variable in the example should be sequence with all R's removed. I cannot search based on the character itself in this situation - it must be based on the position of the character.
# p: per locus, the comma-separated positions to remove (NA = nothing to remove).
p <- data.frame(locus = c("1","2","3"), positions_of_Ns_to_remove = c("12,17,43,100","30,60,61,62",NA))
# x: the sequences, possibly several per locus.
x <- data.frame(locus = c("1","1","2","3"), sequence = c("xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR","xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR","xxxxxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxRRRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx","xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"))
# Left join: keep every sequence and attach the positions of its locus.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
xo <- merge(x, p, by = c("locus"), all.x = TRUE)
> xo
locus sequence positions_of_Ns_to_remove
1 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR 12,17,43,100
2 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR 12,17,43,100
3 2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxRRRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 30,60,61,62
4 3 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx <NA>
This works if there is only 1 row in xo, but not when there are multiple rows. I would like to use tidyverse functions / piping and avoid for loops if possible.
# NOTE: inside mutate() `sequence` and `positions_of_Ns_to_remove` are whole
# columns, so unlist(strsplit(...)) pools the characters and the positions of
# every row, and paste(collapse = "") produces one single string for all rows.
xo %>% dplyr::mutate(new_sequence = paste(
replace( unlist(strsplit(sequence, "")), as.integer(unlist(strsplit(positions_of_Ns_to_remove,","))), "" ),
collapse = "")
)
What I want:
locus new_sequence positions_of_Ns_to_remove
1 1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 12,17,43,100
2 1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 12,17,43,100
3 2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 30,60,61,62
4 3 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx <NA>
You could build a custom function and apply it to your data:
library(stringr)
#' Cut the characters at the given positions out of a string
#'
#' @param string Character vector; the same positions are removed from every
#'   element.
#' @param n Positions (1-based) of the characters to remove; anything that
#'   `as.integer()` can coerce. Missing, duplicated, non-positive and
#'   out-of-range positions are ignored.
#' @return A character vector of the same length as `string`; `NA` elements
#'   stay `NA`.
remove_pos <- function(string, n) {
  string <- as.character(string)
  n <- as.integer(n)
  # Every position refers to the ORIGINAL string, so a repeated position must
  # not remove a second character, and NA or non-positive positions remove nothing.
  n <- unique(n[!is.na(n) & n >= 1L])
  if (length(n) == 0L) {
    return(string)
  }
  drop_chars <- function(s) {
    if (is.na(s)) {
      return(NA_character_)
    }
    chars <- strsplit(s, "")[[1L]]
    # keep by logical mask: positions beyond the end simply never match
    paste(chars[!(seq_along(chars) %in% n)], collapse = "")
  }
  vapply(string, drop_chars, character(1), USE.NAMES = FALSE)
}
# Apply remove_pos() row by row: every (locus, row number) group holds exactly
# one row, so sequence and positions refer to a single string and position set.
xo |>
  mutate(positions = str_split(positions_of_Ns_to_remove, ",")) |>
  group_by(locus, n = row_number()) |>
  mutate(
    # rows without positions keep their sequence unchanged
    new_seq = if (is.na(positions_of_Ns_to_remove)) {
      sequence
    } else {
      remove_pos(sequence, unlist(positions))
    }
  ) |>
  select(-positions) |>
  ungroup()
which returns
# A tibble: 5 x 4
locus sequence positions_of_Ns_to~ new_seq
<chr> <chr> <chr> <chr>
1 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxx~ 12,17,43,100 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
2 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxx~ 12,17,43,100 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
3 2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxx~ 30,60,61,62 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
4 3 Rxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~ 1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
5 4 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~ NA xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
I have a df where one variable is an integer. I'd like to split this column into its individual digits. See my example below.
Group Number
A 456
B 3
C 18
To
Group Number Digit1 Digit2 Digit3
A 456 4 5 6
B 3 3 NA NA
C 18 1 8 NA
We can use read.fwf from base R. Find the max number of character (nchar) in 'Number' column (mx). Read the 'Number' column after converting to character (as.character), specify the 'widths' as 1 by replicating 1 with mx and assign the output to new 'Digit' columns in the data
# Number of digit columns needed = width of the longest number.
mx <- max(nchar(df1$Number))
# Keep a handle on the text connection: read.fwf() does not close a connection
# that is already open, so it is closed explicitly once the digits are read.
digit_con <- textConnection(as.character(df1$Number))
# Read every number as fixed-width fields of width 1; shorter numbers are
# padded with NA.
df1[paste0("Digit", seq_len(mx))] <- read.fwf(digit_con, widths = rep(1, mx))
close(digit_con)
-output
df1
# Group Number Digit1 Digit2 Digit3
#1 A 456 4 5 6
#2 B 3 3 NA NA
#3 C 18 1 8 NA
data
# Example input: one integer per group.
df1 <- data.frame(
  Group = c("A", "B", "C"),
  Number = c(456L, 3L, 18L),
  stringsAsFactors = FALSE
)
Another base R option (I think #akrun's approach using read.fwf is much simpler)
# Split each Number into single characters, pad every character vector with NA
# up to the width of the longest number, bind them into a matrix with columns
# Digit1, Digit2, ..., let type.convert() pick the column types and append the
# result to df.
cbind(
df,
with(
df,
type.convert(
`colnames<-`(do.call(
rbind,
lapply(
strsplit(as.character(Number), ""),
# `length<-` extends a vector to the given length, filling with NA
`length<-`, max(nchar(Number))
)
), paste0("Digit", seq(max(nchar(Number))))),
as.is = TRUE
)
)
)
which gives
Group Number Digit1 Digit2 Digit3
1 A 456 4 5 6
2 B 3 3 NA NA
3 C 18 1 8 NA
Using splitstackshape::cSplit
# Split Number with an empty separator (sep = ''), giving one column per
# character; drop = FALSE keeps the original Number column next to the new
# Number_1, Number_2, ... columns.
splitstackshape::cSplit(df, 'Number', sep = '', stripWhite = FALSE, drop = FALSE)
# Group Number Number_1 Number_2 Number_3
#1: A 456 4 5 6
#2: B 3 3 NA NA
#3: C 18 1 8 NA
Updated
I realized I could use the max function to count the character limit in each row, so that I could include it in my map2 function and save some lines of code, thanks to an accident that led to an inspiration by dear #ThomasIsCoding.
library(dplyr)
library(tidyr)
library(purrr)
library(stringr)
# Work row by row: inside rowwise(), Number is a single value.
df %>%
rowwise() %>%
# Extract the characters of Number one position at a time; the new columns come
# out auto-named (their "..." prefix is replaced with "Digit" further down).
mutate(map2_dfc(Number, 1:max(nchar(Number)), ~ str_sub(.x, .y, .y))) %>%
unnest(cols = !c(Group, Number)) %>%
rename_with(~ str_replace(., "\\.\\.\\.", "Digit"), .cols = !c(Group, Number)) %>%
# Convert the extracted digit columns to numeric.
# NOTE(review): na.rm = TRUE is forwarded to as.numeric(), which has no
# documented na.rm argument — confirm whether it is needed.
mutate(across(!c(Group, Number), as.numeric, na.rm = TRUE))
# A tibble: 3 x 5
Group Number Digit1 Digit2 Digit3
<chr> <dbl> <dbl> <dbl> <dbl>
1 A 456 4 5 6
2 B 3 3 NA NA
3 C 18 1 8 NA
Data
# Example input as a tibble, written column by column.
df <- tibble(
  Group = c("A", "B", "C"),
  Number = c(456, 3, 18)
)
Two base r methods:
# Number of digit columns needed = width of the longest number.
no_cols <- max(nchar(as.character(df1$Number)))

# Using `strsplit()`:
cbind(
  df1,
  setNames(
    data.frame(
      do.call(
        rbind,
        # `length<-` pads each vector of digits with NA up to no_cols
        lapply(strsplit(as.character(df1$Number), ""), `length<-`, no_cols)
      )
    ),
    paste0("Digit", seq_len(no_cols))
  )
)

# Using `regmatches()` and `gregexpr()`:
cbind(
  df1,
  setNames(
    data.frame(
      do.call(
        rbind,
        # every "\\d" match is one digit; pad the matches with NA up to no_cols
        lapply(
          regmatches(df1$Number, gregexpr("\\d", df1$Number)),
          `length<-`,
          no_cols
        )
      )
    ),
    paste0("Digit", seq_len(no_cols))
  )
)
From this dataframe:
# One row per id; `text` holds the tags separated by "?".
# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
dftest <- data.frame(
  id = c(1),
  text = c("java-ee?jsf?omnifaces?jpa"),
  stringsAsFactors = FALSE
)
I would like to produce a dataframe like this
# `java-ee` is not a syntactic name, so it must be written in backticks, and
# check.names = FALSE stops data.frame() from rewriting it to java.ee.
data.frame(id = c(1), `java-ee` = c(1), jsf = c(1), omnifaces = c(1), jpa = c(1), check.names = FALSE)
I use these commands to make it:
# NOTE(review): strsplit() treats `split` as a regular expression unless
# fixed = TRUE, so "?" is not matched as a literal question mark here.
s2 <- strsplit(dftest$text, split = "?")
# NOTE(review): dftest2 is used on the right-hand side before it has been
# created; presumably dftest$id was intended — confirm.
dftest2 <- data.frame(id = rep(dftest2$id, sapply(s2, length)), text = unlist(s2))
dflike_final <- reshape(dftest2, idvar = "id", timevar = "text", direction = "wide")
However, the result of the first two lines is this:
id text
1 1 j
2 1 a
3 1 v
4 1 a
5 1 -
6 1 e
7 1 e
8 1 ?
9 1 j
10 1 s
11 1 f
12 1 ?
13 1 o
14 1 m
15 1 n
16 1 i
17 1 f
18 1 a
19 1 c
20 1 e
21 1 s
22 1 ?
23 1 j
24 1 p
25 1 a
How can I fix it to have the whole string?
We can bring the text in separate rows, create a dummy column (n) and get the data in wide format using pivot_wider.
library(dplyr)
library(tidyr)
# One row per "?"-separated tag, flag every tag with 1, then spread the tags
# into columns.
dftest |>
  separate_rows(text, sep = "\\?") |>
  mutate(n = 1) |>
  pivot_wider(names_from = text, values_from = n)
# A tibble: 1 x 5
# id `java-ee` jsf omnifaces jpa
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 1 1 1 1
As mentioned by #Roland ? is a special character in regex we need to escape it. Also you need to include a dummy column in creating the new dataframe. You can then use your attempt as
# Split on a literal question mark (escaped, because split is a regex).
s2 <- strsplit(dftest$text, split = "\\?")
# Long format: repeat each id once per tag and add the dummy value n = 1.
dftest2 <- data.frame(
  id = rep(dftest$id, times = lengths(s2)),
  text = unlist(s2),
  n = 1
)
# Wide format: one n.<tag> column per distinct tag.
dflike_final <- reshape(
  dftest2,
  direction = "wide",
  idvar = "id",
  timevar = "text"
)
I have the following data table in R:
# Reproducible example: a category, a random A/B label and a random amount.
set.seed(5)
my_data <- data.table(
  cat_1 = c(1, 1, 1, 2, 2, 1, 1, 1, 3, 4, 5, 4, 5),
  # TRUE is spelled out because `T` is an ordinary variable that can be reassigned
  cat_2 = sample(c("A", "B"), 13, replace = TRUE),
  ao = rnorm(13, 500, 10)
)
And I would like to know the number of rows per cat_1, the sum of ao per cat_1, and the number of As in cat_2 per cat_1. So ideally I would like to get this:
# Reference result: per-category row count and ao total, left-joined with the
# per-category count of "A" rows (NA where a category has no "A" row).
merge(
  my_data[, .(cat1_lines = .N, total_ao = sum(ao, na.rm = TRUE)), by = cat_1],
  my_data[cat_2 == "A", .(A_lines = .N), by = cat_1],
  by = "cat_1",
  all.x = TRUE
)
cat_1 cat1_lines total_ao A_lines
1: 1 6 3015.5034 1
2: 2 2 1015.8838 2
3: 3 1 516.9518 NA
4: 4 2 984.0768 2
5: 5 2 983.8361 2
Is there a way of doing this in the same by statement without having to merge? Something like (I know this does not work):
# NOTE(review): .N is the group's row count (a single number), so subsetting it
# with cat_2 == "A" does not count the "A" rows — presumably
# sum(cat_2 == "A") was intended.
my_data[, .(cat1_lines = .N, A_lines = .N[cat_2 == "A"],
total_ao = sum(ao, na.rm = T)), by = cat_1]
You can easily do this with a by statement in your data.table. Try this:
# Per cat_1: number of rows, total of ao (ignoring NAs, as in the merge-based
# version of the question) and number of rows whose cat_2 is "A" (0 when a
# category has none).
my_data[, .(
  cat1_lines = .N,
  total_ao = sum(ao, na.rm = TRUE),
  A_lines = sum(cat_2 == "A")
), by = .(cat_1)]
cat_1 cat1_lines total_ao A_lines
1: 1 6 3015.5034 1
2: 2 2 1015.8838 2
3: 3 1 516.9518 0
4: 4 2 984.0768 2
5: 5 2 983.8361 2