Related
I have this type of data:
# Example data: five word-slot columns (w1..w5), one row per word sequence.
# NA marks a slot that holds no word.
df <- as.data.frame(list(
  w1 = c("A", "B", "C", "E", "F", "G"),
  w2 = c("B", "G", "C", "D", "E", "V"),
  w3 = c("D", "S", "O", "F", NA, "N"),
  w4 = c("E", "U", NA, "T", NA, NA),
  w5 = c("C", NA, NA, NA, NA, NA)
))
I need to iterate through column pairs to rolling-paste the separate strings into bigrams. Note that in the actual data the strings are of variable character length and character type.
I've tried this but it fails:
# Attempt from the question (kept as posted; it does not work):
#  - `i` is not defined anywhere in this snippet, so `x[i]` cannot be evaluated.
#  - lapply() passes the function one column at a time, so even with an index
#    `x[i+1]` would address the next element of that same column, not the
#    neighbouring column.
#  - lapply() runs over five columns (1:5) while only four target names are
#    built on the left-hand side.
df[, paste0("bigr_", 1:4, "_", 2:5)] <- lapply(df[, 1:5],
function(x) paste(x[i], x[i+1], sep = " "))
The expected output is:
w1 w2 w3 w4 w5 bigr_1_2 bigr_2_3 bigr_3_4 bigr_4_5
1 A B D E C A B B D D E E C
2 B G S U <NA> B G G S S U <NA>
3 C C O <NA> <NA> C C C O <NA> <NA>
4 E D F T <NA> E D D F F T <NA>
5 F E <NA> <NA> <NA> F E <NA> <NA> <NA>
6 G V N <NA> <NA> G V V N <NA> <NA>
I'd be most interested in a dplyr solution but am open and grateful for other solutions as well.
As you said you're most interested in a dplyr solution, this can be achieved using mutate() and across(). You can alter the function applied to each column if this doesn't achieve the exact desired output.
# Build bigram columns bigr_1_2 ... bigr_4_5 by pairing each of the first four
# columns with the column to its right.
df %>%
  mutate(
    across(
      # For the first four columns (i.e. has number 1-4 in column name)
      matches("[1-4]"),
      # Apply custom function
      function(col) {
        # The data in the next column along.
        # NOTE(review): cur_data() is deprecated from dplyr 1.1.0 onward in
        # favour of pick(); it is kept so the snippet also runs on older dplyr.
        nxt <- cur_data()[[which(names(cur_data()) == cur_column()) + 1]]
        # A bigram exists only when both words exist. A bare paste() would
        # turn a missing word into the literal text "NA" (e.g. "U NA").
        if_else(
          is.na(col) | is.na(nxt),
          NA_character_,
          paste(col, nxt, sep = " ")
        )
      },
      .names = "{gsub(pattern = 'w', replacement = 'bigr_', {col})}" # alter name of new cols (replace 'w' with 'bigr_')
    )
  ) %>%
  # Append the index of the right-hand column so that bigr_1 becomes bigr_1_2,
  # matching the desired output
  rename_with(
    .cols = matches("bigr"),
    .fn = function(colname) {
      paste0(colname, "_", as.numeric(gsub(pattern = "bigr_", replacement = "", colname)) + 1)
    }
  )
# Example data: word slots w1..w5 per row; NA marks an empty slot.
df <- data.frame(
  w1 = c("A", "B", "C", "E", "F", "G"),
  w2 = c("B", "G", "C", "D", "E", "V"),
  w3 = c("D", "S", "O", "F", NA, "N"),
  w4 = c("E", "U", NA, "T", NA, NA),
  w5 = c("C", NA, NA, NA, NA, NA)
)

library(tidyverse)
library(janitor)

df %>%
  mutate(rn = row_number()) %>%
  # Long format, one row per (rn, word). Empty slots are dropped here, so
  # `value` itself is never NA in the steps below.
  pivot_longer(-rn, values_drop_na = TRUE) %>%
  group_by(rn) %>%
  # Pair each word with its successor within the same original row. The last
  # word has no successor; that is tested on the data (is.na) rather than by
  # searching the pasted text for "_NA", so a word that merely starts with
  # "NA" still yields a bigram.
  mutate(
    bigr = if_else(
      is.na(lead(value)),
      NA_character_,
      paste0(value, "_", lead(value))
    )
  ) %>%
  # id_cols is named explicitly instead of being passed by position
  pivot_wider(id_cols = rn, names_from = c(name), values_from = c(value, bigr)) %>%
  # Drops the bigram column of the last word slot, which is entirely NA
  remove_empty("cols") %>%
  ungroup() %>%
  select(-rn) %>%
  # value_w1 -> w1, bigr_w1 -> bigr_1
  rename_with(~str_remove(string = ., "value_")) %>%
  rename_with(~str_replace(., "(_w)(\\d+)", "_\\2"))
#> # A tibble: 6 × 9
#> w1 w2 w3 w4 w5 bigr_1 bigr_2 bigr_3 bigr_4
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 A B D E C A_B B_D D_E E_C
#> 2 B G S U <NA> B_G G_S S_U <NA>
#> 3 C C O <NA> <NA> C_C C_O <NA> <NA>
#> 4 E D F T <NA> E_D D_F F_T <NA>
#> 5 F E <NA> <NA> <NA> F_E <NA> <NA> <NA>
#> 6 G V N <NA> <NA> G_V V_N <NA> <NA>
Created on 2022-04-26 by the reprex package (v2.0.1)
As long as you don't have a string that is NA, you could try:
# For every column except the first, pair it with the column to its left and
# store the result as bigr_<left>_<current>.
df %>%
  mutate(across(-1,
    ~ {
      # Column immediately to the left, looked up by position instead of by
      # rebuilding its name with paste0("w", ...) and get()
      prev <- cur_data()[[match(cur_column(), names(cur_data())) - 1]]
      # Missingness is checked on the inputs themselves. Searching the pasted
      # result for the text "NA" would also wipe bigrams made of real words
      # that happen to contain those letters.
      if_else(is.na(prev) | is.na(.x), NA_character_, paste(prev, .x))
    },
    .names = 'bigr_{paste0("w", match(.col, names(cur_data())) - 1)}_{.col}'))
w1 w2 w3 w4 w5 bigr_w1_w2 bigr_w2_w3 bigr_w3_w4 bigr_w4_w5
1 A B D E C A B B D D E E C
2 B G S U <NA> B G G S S U <NA>
3 C C O <NA> <NA> C C C O <NA> <NA>
4 E D F T <NA> E D D F F T <NA>
5 F E <NA> <NA> <NA> F E <NA> <NA> <NA>
6 G V N <NA> <NA> G V V N <NA> <NA>
As you are open to non-dplyr solutions, we can do it in base R by modifying your original code:
# Pair columns 1:4 with their right-hand neighbours 2:5
left_words <- df[, 1:4]
right_words <- df[, 2:5]
bigrams <- mapply(paste, left_words, right_words)
# paste() coerces NA to the text "NA", so blank out every cell where either
# word of the pair is missing before the columns are stored
bigrams[is.na(left_words) | is.na(right_words)] <- NA
df[, paste0("bigr_", 1:4, "_", 2:5)] <- bigrams
df
# w1 w2 w3 w4 w5 bigr_1_2 bigr_2_3 bigr_3_4 bigr_4_5
# 1 A B D E C A B B D D E E C
# 2 B G S U <NA> B G G S S U <NA>
# 3 C C O <NA> <NA> C C C O <NA> <NA>
# 4 E D F T <NA> E D D F F T <NA>
# 5 F E <NA> <NA> <NA> F E <NA> <NA> <NA>
# 6 G V N <NA> <NA> G V V N <NA> <NA>
We can use the tidytext package as follows:
df %>%
  rowid_to_column() %>%
  # Join each row's words into one sentence. na.rm = TRUE leaves missing words
  # out; without it they are pasted in as the text "NA" and come back as
  # bigrams such as "U NA".
  unite(col, -rowid, sep = ' ', na.rm = TRUE) %>%
  tidytext::unnest_ngrams(value, 'col', 2, to_lower = FALSE) %>%
  # Number the bigrams within each original row
  group_by(rowid) %>%
  mutate(name = row_number()) %>%
  ungroup() %>%
  # id_cols is named explicitly instead of being passed by position
  pivot_wider(id_cols = rowid, names_prefix = 'bgram_')
# # A tibble: 6 x 5
#   rowid bgram_1 bgram_2 bgram_3 bgram_4
#   <int> <chr>   <chr>   <chr>   <chr>
# 1     1 A B     B D     D E     E C
# 2     2 B G     G S     S U     <NA>
# 3     3 C C     C O     <NA>    <NA>
# 4     4 E D     D F     F T     <NA>
# 5     5 F E     <NA>    <NA>    <NA>
# 6     6 G V     V N     <NA>    <NA>
using data.table
library(data.table)
# `:=` is only defined for data.tables, so convert the data frame in place first
setDT(df)
# Pair columns 1:4 with columns 2:5; a pair with a missing word stays missing.
# NA_character_ keeps every new column of type character, even one in which
# all pairs are missing.
df[, (paste("bigr", 1:4, 2:5, sep = "_")) := Map(
  function(x, y) ifelse(is.na(x) | is.na(y), NA_character_, paste(x, y)),
  .SD[, 1:4], .SD[, 2:5]
)]
I have a dataframe of the following type
ID case1 case2 case3 case4
1 A B C D
2 B A
3 E F
4 G C A
5 T
I need to change its format, to a long shape, similar as the below:
ID col1 col2
1 A B
1 A C
1 A D
1 B C
1 B D
1 C D
2 B A
3 E F
4 G C
4 G A
4 C A
5 T
As you can see, I need to maintain the ID and ignore empty columns. There are some cases like T that need to remain in the dataset, but without a col2.
I am honestly not sure how to approach this, so that is why there are no examples of what I have tried.
You can get the data into long format and then create all pairwise combinations of the values for each ID, provided that ID has more than one row; an ID with a single value is kept with NA in col2.
library(dplyr)
library(tidyr)
# All unordered pairs of `v` as a two-column data frame (col1, col2).
# A single value is returned on its own row with a missing partner.
pair_values <- function(v) {
  if (length(v) > 1) {
    pairs <- as.data.frame(t(combn(v, 2)))
    setNames(pairs, c('col1', 'col2'))
  } else {
    data.frame(col1 = v[1], col2 = NA_character_)
  }
}

df %>%
  # One row per non-missing case value
  pivot_longer(cols = -ID, values_drop_na = TRUE) %>%
  group_by(ID) %>%
  # One data frame of pairs per ID, held in a list column ...
  summarise(value = list(pair_values(value))) %>%
  # ... then expanded back into ordinary rows
  unnest(value)
# A tibble: 12 x 3
# ID col1 col2
# <int> <chr> <chr>
# 1 1 A B
# 2 1 A C
# 3 1 A D
# 4 1 B C
# 5 1 B D
# 6 1 C D
# 7 2 B A
# 8 3 E F
# 9 4 G C
#10 4 G A
#11 4 C A
#12 5 T NA
data
# Example data: one row per ID with up to four case values (NA = no value)
df <- data.frame(
  ID = 1:5,
  case1 = c("A", "B", "E", "G", "T"),
  case2 = c("B", "A", "F", "C", NA),
  case3 = c("C", NA, NA, "A", NA),
  case4 = c("D", NA, NA, NA, NA),
  stringsAsFactors = FALSE
)
So I have 9 columns
a b c d e f g h i
1 1 t p 1 h p 1 v g
2 2 e h 2 j m 2 c f
3 3 f g 3 k l 3 b d
and I want to know how I can make them look like this
a b c
1 1 t p
2 2 e h
3 3 f g
4 1 h p
5 2 j m
6 3 k l
7 1 v g
8 2 c f
9 3 b d
We can use reshape from base R by specifying the columns to combine together in a list of vectors
# Each vector lists the wide columns that are stacked into one long column
stacked_sets <- list(c('a', 'd', 'g'), c('b', 'e', 'h'), c('c', 'f', 'i'))
long <- reshape(df1, direction = 'long', varying = stacked_sets)
# Keep only the three stacked columns and reset the row names
out <- long[c('a', 'b', 'c')]
row.names(out) <- NULL
out
# a b c
#1 1 t p
#2 2 e h
#3 3 f g
#4 1 h p
#5 2 j m
#6 3 k l
#7 1 v g
#8 2 c f
#9 3 b d
Or using melt from data.table
library(data.table)
# Convert df1 to a data.table in place, then stack the three column sets
setDT(df1)
melt(
  df1,
  measure.vars = list(c('a', 'd', 'g'), c('b', 'e', 'h'), c('c', 'f', 'i')),
  value.name = c('a', 'b', 'c')
)[, variable := NULL][]  # drop the set index; the trailing [] prints the result
data
# Example data: three repeated (number, letter, letter) column triples
df1 <- data.frame(
  a = 1:3, b = c("t", "e", "f"), c = c("p", "h", "g"),
  d = 1:3, e = c("h", "j", "k"), f = c("p", "m", "l"),
  g = 1:3, h = c("v", "c", "b"), i = c("g", "f", "d"),
  row.names = c("1", "2", "3"),
  stringsAsFactors = FALSE
)
One option involving purrr could be:
# Deal the columns round-robin into three sets (1st, 4th, 7th, ... and so on)
column_sets <- split.default(df, rep(1:3, length.out = length(df)))
# Stack every set into one long column and bind the three results side by side
map_dfc(column_sets, function(set) stack(set)[1]) %>%
  setNames(c("a", "b", "c"))
a b c
1 1 t p
2 2 e h
3 3 f g
4 1 h p
5 2 j m
6 3 k l
7 1 v g
8 2 c f
9 3 b d
A data frame contains ID, group, n (numeric), and several factor variables
# Example data: one row per ID; `n` says how many of b1..b6 go into the join,
# and "" marks a blank cell
ID <- as.numeric(1:10)
group <- c("m", "m", "m", "f", "f", "m", "m", "f", "f", "m")
n <- c(1, 2, 6, 3, 6, 8, 4, 1, 4, 2)
b1 <- c("a", "b", "",  "a", "d", "d", "a", "c", "c", "b")
b2 <- c("a", "",  "e", "a", "d", "d", "a", "c", "c", "b")
b3 <- c("a", "b", "",  "a", "",  "d", "a", "c", "c", "b")
b4 <- c("a", "b", "e", "a", "",  "d", "a", "c", "c", "b")
b5 <- c("a", "b", "e", "a", "d", "",  "",  "",  "c", "b")
b6 <- c("a", "",  "",  "",  "d", "d", "",  "c", "c", "b")
df <- data.frame(
  ID, group, n,
  b1, b2, b3, b4, b5, b6
)
I need to create a new character column (call it y).
The way to compute y is to join the first n of the variables (b1, b2, b3, b4, b5, b6), using commas to separate them.
Note, in case a column is a blank, then remove it from the join.
For example, for ID=1, y = "a"; for ID = 2, y = "b" (not "b, "); for ID = 3, y = "e,e,e", etc.
And, the faster the code, the better.
A possible solution, though speed might still be an issue:
# Build y row by row: take the first n of the b-columns (columns 4:9), drop
# the blanks, and join what is left with commas.
# vapply() rather than sapply() so the result is always a character vector,
# also when df has no rows.
df$y <- vapply(seq_len(nrow(df)), function(i) {
  first_n <- head(unlist(df[i, 4:9]), df$n[i])
  paste(first_n[first_n != ""], collapse = ",")
}, character(1))
# ID group n b1 b2 b3 b4 b5 b6 y
# 1 1 m 1 a a a a a a a
# 2 2 m 2 b b b b b
# 3 3 m 6 e e e e,e,e
# 4 4 f 3 a a a a a a,a,a
# 5 5 f 6 d d d d d,d,d,d
# 6 6 m 8 d d d d d d,d,d,d,d
# 7 7 m 4 a a a a a,a,a,a
# 8 8 f 1 c c c c c c
# 9 9 f 4 c c c c c c c,c,c,c
# 10 10 m 2 b b b b b b b,b
Here is an option using gsub and paste. We paste the 'b' columns of 'df' together (do.call(paste0, df[-(1:3)])), then use substring to keep only the number of characters given by the 'n' column, and finally use gsub to insert a ',' between each pair of adjacent characters.
# Paste / substring / gsub approach. It assumes every non-blank value is
# exactly one character long.
b_vals <- as.matrix(df[4:9])
# Blank cells become a one-character placeholder so that, after pasting,
# character k of a row's string still belongs to column b<k>. Without it the
# blanks vanish and substring() reaches past the first n columns (ID 2 would
# give "b,b" although only b1 = "b" and b2 = "" are in scope).
b_vals[which(b_vals == "")] <- " "
pasted <- do.call(paste0, as.data.frame(b_vals, stringsAsFactors = FALSE))
first_n <- substring(pasted, 1, df$n)
# Drop the placeholders, then put a comma between every two adjacent characters
df$y <- gsub("(?<=\\S)(?=\\S)", ",",
             gsub(" ", "", first_n, fixed = TRUE), perl = TRUE)
df
# ID group n b1 b2 b3 b4 b5 b6 y
#1 1 m 1 a a a a a a a
#2 2 m 2 b b b b b
#3 3 m 6 e e e e,e,e
#4 4 f 3 a a a a a a,a,a
#5 5 f 6 d d d d d,d,d,d
#6 6 m 8 d d d d d d,d,d,d,d
#7 7 m 4 a a a a a,a,a,a
#8 8 f 1 c c c c c c
#9 9 f 4 c c c c c c c,c,c,c
#10 10 m 2 b b b b b b b,b
# Character matrix of just the b-columns (4:9). apply() over the whole data
# frame would coerce every column to text, `n` included, so head() would be
# handed a string and a count of 10 or more would be compared as text.
b_vals <- as.matrix(df[4:9])
df$y <- vapply(seq_len(nrow(df)), function(i) {
  # First n values of row i, blanks removed, joined with commas
  kept <- head(b_vals[i, ], df$n[i])
  paste(kept[kept != ""], collapse = ",")
}, character(1))
df
# ID group n b1 b2 b3 b4 b5 b6 y
# 1 1 m 1 a a a a a a a
# 2 2 m 2 b b b b b
# 3 3 m 6 e e e e,e,e
# 4 4 f 3 a a a a a a,a,a
# 5 5 f 6 d d d d d,d,d,d
# 6 6 m 8 d d d d d d,d,d,d,d
# 7 7 m 4 a a a a a,a,a,a
# 8 8 f 1 c c c c c c
# 9 9 f 4 c c c c c c c,c,c,c
# 10 10 m 2 b b b b b b b,b
I have a data set like this.
# Example data: ten observations of four categorical attributes
a <- data.frame(
  Prone = c("M", "N", "N", "N", "M", "N", "M", "N", "M", "M"),
  Type = c("A", "B", "C", "A", "A", "A", "B", "B", "C", "B"),
  Alc = c("A", "B", "N", "A", "A", "A", "B", "B", "B", "B"),
  Com = c("Y", "N", "Y", "Y", "Y", "Y", "Y", "N", "N", "Y"),
  stringsAsFactors = FALSE
)
a
Prone Type Alc Com
1 M A A Y
2 N B B N
3 N C N Y
4 N A A Y
5 M A A Y
6 N A A Y
7 M B B Y
8 N B B N
9 M C B N
10 M B B Y
I would like to get the frequency count of each unique row, like the following:
Prone Type Alc Com Freq
1 M A A Y 2
2 M B B Y 2
3 M C B N 1
4 N A A Y 2
5 N B B N 2
6 N C N Y 1
Thanks in advance.
Alternate plyr solution:
> library("plyr")
> count(a)
Prone Type Alc Com freq
1 M A A Y 2
2 M B B Y 2
3 M C B N 1
4 N A A Y 2
5 N B B N 2
6 N C N Y 1
The mandatory data.table solution:
library(data.table)
# Copy the data frame into a data.table
dt <- data.table(a)
# Group by every column; .N is the number of rows in each group
dt[, .(Freq = .N), by = names(dt)]
There are a lot of ways to do this, here is a simple plyr example:
> library(plyr)
> ddply(a,names(a),summarize,Freq=length(Prone))
Prone Type Alc Com Freq
1 M A A Y 2
2 M B B Y 2
3 M C B N 1
4 N A A Y 2
5 N B B N 2
6 N C N Y 1
Using base aggregate:
# Add a dummy Freq column, then count its entries within every combination of
# the remaining columns
with_dummy <- transform(a, Freq = seq_len(nrow(a)))
aggregate(Freq ~ ., data = with_dummy, FUN = length)
Prone Type Alc Com Freq
1 N B B N 2
2 M C B N 1
3 M A A Y 2
4 N A A Y 2
5 M B B Y 2
6 N C N Y 1
Here's another approach:
library(qdap)
# Collapse each row into one pasted key and tabulate the keys ...
key_counts <- table(paste2(a))
# ... then split the key back into the original columns
colsplit2df(data.frame(key_counts), new.names = names(a))
## > colsplit2df(data.frame(table(paste2(a))), new.names = names(a))
## Prone Type Alc Com Freq
## 1 M A A Y 2
## 2 M B B Y 2
## 3 M C B N 1
## 4 N A A Y 2
## 5 N B B N 2
## 6 N C N Y 1