multicolumn csv text into presence absence matri - r

I have a csv file like this:
col1 col2 col3
r1 a,b,c e,f g
r2 h,i j,k
r3 l m,n,o
some cells have multiple text comma separated, some have single and some have none.I want to convert this like:
col1 col2 col3
a 1 0 0
b 1 0 0
c 1 0 0
e 0 1 0
f 0 1 0
g 0 0 1
h 1 0 0
i 1 0 0
j 0 0 1
k 0 0 1
l 1 0 0
m 0 1 0
n 0 1 0
o 0 1 0
Any suggestion? I tried pivot table in excel but not getting the desired output.
Thanks in advance.
Best Regards
Zillur

not sure whether this is the shortest solution (probably not) but it produces the desired output. Basically, we go through all three columns and count the occurrences of the strings and get a long format data frame that we then flip to the wide format you want.
library(tidyr)
library(purrr)
df <- data_frame(col1 = c("a,b,c", "h,i", "l"),
col2 = c("e,f", "", "m,n,o"),
col3 = c("g", "j,k", ""))
let_df <- map_df(df, function(col){
# map_df applies the function to each column of df
# split strings at "," and unlist to get vector of letters
letters <- unlist(str_split(col, ","))
# delete ""
letters <- letters[nchar(letters) > 0]
# count occurrences for each letter
tab <- table(letters)
# replace with 1 if occurs more often
tab[tab > 1] <- 1
# create data frame from table
df <- data_frame(letter = names(tab), count = tab)
return(df)
}, .id = "col") # id adds a column col that contains col1 - col3
# bring data frame into wide format
let_df %>%
spread(col, count, fill = 0)

Such a great problem to solve. Here is my take on it in base R:
col1 <- c("a,b,c","h,i","l")
col2 <- c("e,f","","m,n,o")
col3 <- c("g","j,k","")
data <- data.frame(col1, col2, col3, stringsAsFactors = F)
restructure <- function(df){
df[df==""] <- "missing"
result_rows <- as.character()
l <- list()
for (i in seq_along(colnames(df)) ){
df_col <- sort(unique(unlist(strsplit(gsub(" ", "",toString(df[[i]])), ","))))
df_col <- df_col[!df_col %in% "missing"]
result_rows <- sort(unique(c(result_rows, df_col)))
l[i] <- list(df_col)
}
result <- data.frame(result_rows)
for (j in seq_along(l)){
result$temp <- NA
result$temp[match(l[[j]], result_rows)] <- 1
colnames(result)[colnames(result)=="temp"] <- colnames(df)[j]
}
result[is.na(result)] <- 0
return(result)
}
> restructure(data)
# result_rows col1 col2 col3
#1 a 1 0 0
#2 b 1 0 0
#3 c 1 0 0
#4 e 0 1 0
#5 f 0 1 0
#6 g 0 0 1
#7 h 1 0 0
#8 i 1 0 0
#9 j 0 0 1
#10 k 0 0 1
#11 l 1 0 0
#12 m 0 1 0
#13 n 0 1 0
#14 o 0 1 0

Related

find peaks and valleys for each column of a dataframe

I have a dataframe df, I would like to find peaks and valleys for each column and then replace the points where peaks and valleys are present with the value 1.
Here I made an example by applying it to only one column.
Is it possible to do this for all the columns in the dataframe?
df <- data.frame(a = sample(1:10,10),
b = sample(1:10,10),
c = sample(1:10,10),
d = sample(1:10,10),
e = sample(1:10,10))
vallys<- findValleys(df$b, thresh =0)
peaks <- findPeaks(df$b, thresh = 0)
df$b <- rep(0, nrow(df))
df$b <- replace(df$b, peaks, values=1)
df$b <- replace(df$b, vallys, values=1)
Thank you
The easiest thing is to put your code into a function.
library(quantmod)
replace_peaks_valleys <- function(x) {
valleys <- findValleys(x, thresh = 0)
peaks <- findPeaks(x, thresh = 0)
new_col <- rep(0, length(x))
new_col <- replace(new_col, peaks, values = 1)
new_col <- replace(new_col, valleys, values = 1)
return(new_col)
}
Then you can choose whether to do it in base R, dplyr or data.table.
base R
As you want to assign back to your original data frame, in base R you can do (note the square brackets or it will return a list):
df[] <- lapply(df, replace_peaks_valleys)
head(df)
# a b c d e
# 1 0 0 0 0 0
# 2 0 0 0 0 0
# 3 1 1 1 1 1
# 4 1 0 1 1 0
# 5 1 1 0 1 0
# 6 0 1 1 1 1
dplyr
Alternatively, with dplyr you can just do:
library(dplyr)
df |>
mutate(
across(
a:e, replace_peaks_valleys
)
)
# a b c d e
# 1 0 0 0 0 0
# 2 0 0 0 0 0
# 3 1 1 1 1 1
# 4 1 0 1 1 0
# <etc>
data.table
You can also do this with data.table:
library(data.table)
dt <- setDT(df)
dt[, lapply(.SD, replace_peaks_valleys)]
# a b c d e
# 1: 0 0 0 0 0
# 2: 0 0 0 0 0
# 3: 1 0 1 1 1
# 4: 1 1 0 0 0
# <etc>
N.B. I used set.seed(1) before I ran your code - if you do this as well you should exactly the same output.
Function definition
I just copied and pasted your code and made it into a function. You could change it so you assign 0 or 1 to the existing vector, rather than creating a new vector every time:
replace_peaks_valleys2 <- function(x) {
valleys <- findValleys(x, thresh = 0)
peaks <- findPeaks(x, thresh = 0)
x[] <- 0
x[c(peaks,valleys)] <- 1
return(x)
}

Transforming dataset

I have a dataset that looks like below that I want to trasnsform to another format that assign true/false based on whether certain string is present. What's the best way to do it either in Excel or R?
Thanks!
Initial dataset:
Row1 A D
Row2 B C
Row3 A C E
The format I want:
A B C D E
Row1 1 0 0 1 0
Row2 0 1 1 0 0
Row3 1 0 1 0 1
Here is a base R way with lapply and xtabs.
I assume that filename holds the data file name.
x <- readLines(filename)
x <- strsplit(x, " ")
l <- lapply(x, \(y) {
values <- y[-1]
rows <- rep(y[1], length(values))
data.frame(rows, values)
})
df1 <- do.call(rbind, l)
rm(x, l)
xtabs(~ rows + values, df1)
#> values
#> rows A B C D E
#> Row1 1 0 0 1 0
#> Row2 0 1 1 0 0
#> Row3 1 0 1 0 1
Created on 2022-09-08 by the reprex package (v2.0.1)

extracting unique combinations from a long list of binary variables

I have a dataframe containing a long list of binary variables. Each row represents a participant, and columns represent whether a participant made a certain choice (1) or not (0). For the sakes of simplicity, let's say there's only four binary variables and 6 participants.
df <- data.frame(a = c(0,1,0,1,0,1),
b = c(1,1,1,1,0,1),
c = c(0,0,0,1,1,1),
d = c(1,1,0,0,0,0))
>df
# a b c d
# 1 0 1 0 1
# 2 1 1 0 1
# 3 0 1 0 0
# 4 1 1 1 0
# 5 0 0 1 0
# 6 1 1 1 0
In the dataframe, I want to create a list of columns that reflect each unique combination of variables in df (i.e., abc, abd, bcd, cda). Then, for each row, I want to add value "1" if the row contains the particular combination corresponding to the column. So, if the participant scored 1 on "a", "b", and "c", and 0 on "d" he would have a score 1 in the newly created column "abc", but 0 in the other columns. Ideally, it would look something like this.
>df_updated
# a b c d abc abd bcd cda
# 1 0 1 0 1 0 0 0 0
# 2 1 1 0 1 0 1 0 0
# 3 0 1 0 0 0 0 0 0
# 4 1 1 1 0 1 0 0 0
# 5 0 0 1 0 0 0 0 0
# 6 1 1 1 0 0 0 0 0
The ultimate goal is to have an idea of the frequency of each of the combinations, so I can order them from the most frequently chosen to the least frequently chosen. I've been thinking about this issue for days now, but couldn't find an appropriate answer. I would very much appreciate the help.
Something like this?
funCombn <- function(data){
f <- function(x, data){
data <- data[x]
list(
name = paste(x, collapse = ""),
vec = apply(data, 1, function(x) +all(as.logical(x)))
)
}
res <- combn(names(df), 3, f, simplify = FALSE, data = df)
out <- do.call(cbind.data.frame, lapply(res, '[[', 'vec'))
names(out) <- sapply(res, '[[', 'name')
cbind(data, out)
}
funCombn(df)
# a b c d abc abd acd bcd
#1 0 1 0 1 0 0 0 0
#2 1 1 0 1 0 1 0 0
#3 0 1 0 0 0 0 0 0
#4 1 1 1 0 1 0 0 0
#5 0 0 1 0 0 0 0 0
#6 1 1 1 0 1 0 0 0
Base R option using combn :
n <- 3
cbind(df, do.call(cbind, combn(names(df), n, function(x) {
setNames(data.frame(as.integer(rowSums(df[x] == 1) == n)),
paste0(x, collapse = ''))
}, simplify = FALSE))) -> result
result
# a b c d abc abd acd bcd
#1 0 1 0 1 0 0 0 0
#2 1 1 0 1 0 1 0 0
#3 0 1 0 0 0 0 0 0
#4 1 1 1 0 1 0 0 0
#5 0 0 1 0 0 0 0 0
#6 1 1 1 0 1 0 0 0
Using combn create all combinations of column names taking n columns at a time. For each of those combinations assign 1 to those rows where all the 3 combinations are 1 or 0 otherwise.
If you are just looking for a frequency of the combinations (and they don't need to be back in the original data), then you could use something like this:
df <- data.frame(a = c(0,1,0,1,0,1),
b = c(1,1,1,1,0,1),
c = c(0,0,0,1,1,1),
d = c(1,1,0,0,0,0))
n <- names(df)
out <- sapply(n, function(x)ifelse(df[[x]] == 1, x, ""))
combs <- apply(out, 1, paste, collapse="")
sort(table(combs))
# combs
# abd b bd c abc
# 1 1 1 1 2
Ok, so let's use your data, including one row without any 1's:
df <- data.frame(
a = c(0,1,0,1,0,1,0),
b = c(1,1,1,1,0,1,0),
c = c(0,0,0,1,1,1,0),
d = c(1,1,0,0,0,0,0)
)
Now I want to paste all column names together if they have a 1, and then make that a wide table (so that all have a column for a combination). Of course, I fill all resulting NAs with 0's.
df2 <- df %>%
dplyr::mutate(
combination = paste0(
ifelse(a == 1, "a", ""), # There is possibly a way to automate this as well using across()
ifelse(b == 1, "b", ""),
ifelse(c == 1, "c", ""),
ifelse(d == 1, "d", "")
),
combination = ifelse(
combination == "",
"nothing",
paste0("comb_", combination)
),
value = ifelse(
is.na(combination),
0,
1
),
i = dplyr::row_number()
) %>%
tidyr::pivot_wider(
names_from = combination,
values_from = value,
names_repair = "unique"
) %>%
replace(., is.na(.), 0) %>%
dplyr::select(-i)
Since you want to order the original df by frequency, you can create a summary of all combinations (excluding those without anything filled in). Then you just make it a long table and pull the column for every combination (arranged by frequency) from the table.
comb_in_order <- df2 %>%
dplyr::select(
-tidyselect::any_of(
c(
names(df),
"nothing" # I think you want these last.
)
)
) %>%
dplyr::summarise(
dplyr::across(
.cols = tidyselect::everything(),
.fns = sum
)
) %>%
tidyr::pivot_longer(
cols = tidyselect::everything(),
names_to = "combination",
values_to = "frequency"
) %>%
dplyr::arrange(
dplyr::desc(frequency)
) %>%
dplyr::pull(combination)
The only thing to do then is to reconstruct the original df by these after arranging by the columns.
df2 %>%
dplyr::arrange(
across(
tidyselect::any_of(comb_in_order),
desc
)
) %>%
dplyr::select(
tidyselect::any_of(names(df))
)
This should work for all possible combinations.

How to match characters change in R

I have below-mentioned dataframe in R:
ID source_field_1 field_1 source_field_3 field_3
ER-1 AC45U CD34I 1992-01-23 23/01/1992
ER-2 AB15X 1971-01-23 23/1/1971
ER-3 DB22U AC22Z 1962-11-13 3/11/1962
ER-4 CF12R BA23D 1992-01-23 23/01/1992
I need a group by count of change of characters from column source_field_1 to field_1, from A to Z and from 0 to 9.
Required Output:
source_field_1 A B C D E . . . Z
A 1
B 1
C 1 1
D 1
E
F 1
.
. 1
. 1
Z
Need the same structure for numerical characters as well for both field_1 and field_3.
df1 <- na.omit(df)
create <- function(from,to,nm)
{
s <- sprintf("[^%s]",paste0(nm,collapse = ""))
from <- unlist(strsplit(gsub(s,"",from),""))
to <- unlist(strsplit(gsub(s,"",to),""))
table(from,to)
}
create(df1$source_field_1,df1$field_1,0:9)
to
from 2 3 4
1 1 0 0
2 2 1 0
4 0 1 0
5 0 0 1
create(df1$source_field_1,df1$field_1,LETTERS)
to
from A B C D I Z
A 0 0 1 0 0 0
B 0 0 1 0 0 0
C 0 1 0 1 0 0
D 1 0 0 0 0 0
F 1 0 0 0 0 0
R 0 0 0 1 0 0
U 0 0 0 0 1 1
This is rather simple to achieve by splitting up each character and using the table function.
library(stringr)
df <- [your df]
out <- vector('list', nrow(df))
for(i in seq_along(out)){
#Split both columns
splitted_str <- str_split(unlist(df[i, c('source_field_1', 'field_1')]), '')
#Alternative in base R:
#gsub(LETTERS, '', unlist(df[i, c('source_field_1', 'field_1')]))
#convert to factors, "levels" will be used in our columns
splitted_str <- lapply(splitted_str, factor, levels = LETTERS)
#Create table. dnn sets the names shown for column/rows
out[[i]] <- table(splitted_str, dnn = c('source_field_1', 'field_1'))
}
note that i abuse the fact that factor(...) sets all values not in levels to NA, and by default table(...) excludes these in the table.
Obviously this could all be combined into a single line
out <- lapply(seq(nrow(df)),
function(x) table(lapply(str_split(unlist(df[i, c('source_field_1', 'field_1')]), ''), factor, levels = LETTERS), dnn = c('source_field_1', 'field_1'))
)

rbind list of arbitrary number of dataframes

I have a list of dataframes with some overlapping columns in each. The number of dataframes in the list is unknown. How can I efficiently, in base, rbind the dataframes together and fill in non overlapping columns with zeros?
Example data:
x <- data.frame(a=1:2, b=1:2, c=1:2)
y <- data.frame(a=1:2, r=1:2, f=1:2)
z <- data.frame(b=1:3, c=1:3, v=1:3, t=c("A", "A", "D"))
L1 <- list(x, y, z)
Desired output:
a b c f r t v
1 1 1 1 0 0 0 0
2 2 2 2 0 0 0 0
3 1 0 0 1 1 0 0
4 2 0 0 2 2 0 0
5 0 1 1 0 0 A 1
6 0 2 2 0 0 A 2
7 0 3 3 0 0 D 3
Pad out each data frame with the missing columns, then rbind them:
allnames <- unique(unlist(lapply(L1, names)))
do.call(rbind, lapply(L1, function(df) {
not <- allnames[!allnames %in% names(df)]
df[, not] <- 0
df
}))
I have an old (and probably inefficient) function that does this. I've made one modification here to allow the fill to be specified.
RBIND <- function(datalist, keep.rownames = TRUE, fill = NA) {
Len <- sapply(datalist, ncol)
if (all(diff(Len) == 0)) {
temp <- names(datalist[[1]])
if (all(sapply(datalist, function(x) names(x) %in% temp))) tryme <- "basic"
else tryme <- "complex"
}
else tryme <- "complex"
almost <- switch(
tryme,
basic = { do.call("rbind", datalist) },
complex = {
Names <- unique(unlist(lapply(datalist, names)))
NROWS <- c(0, cumsum(sapply(datalist, nrow)))
NROWS <- paste(NROWS[-length(NROWS)]+1, NROWS[-1], sep=":")
out <- lapply(1:length(datalist), function(x) {
emptyMat <- matrix(fill, nrow = nrow(datalist[[x]]), ncol = length(Names))
colnames(emptyMat) <- Names
emptyMat[, match(names(datalist[[x]]),
colnames(emptyMat))] <- as.matrix(datalist[[x]])
emptyMat
})
do.call("rbind", out)
})
Final <- as.data.frame(almost, row.names = 1:nrow(almost))
Final <- data.frame(lapply(Final, function(x) type.convert(as.character(x))))
if (isTRUE(keep.rownames)) {
row.names(Final) <- make.unique(unlist(lapply(datalist, row.names)))
}
Final
}
Here it is on your sample data.
RBIND(L1, fill = 0)
# a b c r f v t
# 1 1 1 1 0 0 0 0
# 2 2 2 2 0 0 0 0
# 1.1 1 0 0 1 1 0 0
# 2.1 2 0 0 2 2 0 0
# 1.2 0 1 1 0 0 1 A
# 2.2 0 2 2 0 0 2 A
# 3 0 3 3 0 0 3 D

Resources